Sample usage for bnc
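
The examples below assume the reader was built from the FX8.xml sample used later in this file; a minimal setup sketch (the `root` path is an assumption and depends on where the sample file lives):

>>> import os.path
>>> import nltk.test
>>> from nltk.corpus.reader import BNCCorpusReader
>>> root = os.path.dirname(nltk.test.__file__)  # assumed location of FX8.xml
>>> bnc = BNCCorpusReader(root=root, fileids='FX8.xml')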

Checking word access.

>>> len(bnc.words())
151
>>> bnc.words()[:6]
['Ah', 'there', 'we', 'are', ',', '.']
>>> bnc.words(stem=True)[:6]
['ah', 'there', 'we', 'be', ',', '.']
>>> bnc.tagged_words()[:6]
[('Ah', 'INTERJ'), ('there', 'ADV'), ('we', 'PRON'), ('are', 'VERB'), (',', 'PUN'), ('.', 'PUN')]
>>> bnc.tagged_words(c5=True)[:6]
[('Ah', 'ITJ'), ('there', 'AV0'), ('we', 'PNP'), ('are', 'VBB'), (',', 'PUN'), ('.', 'PUN')]

Testing access to the sentences.

>>> len(bnc.sents())
15
>>> bnc.tagged_sents()[0]
[('Ah', 'INTERJ'), ('there', 'ADV'), ('we', 'PRON'), ('are', 'VERB'), (',', 'PUN'), ('.', 'PUN')]
>>> bnc.tagged_sents(c5=True)[0]
[('Ah', 'ITJ'), ('there', 'AV0'), ('we', 'PNP'), ('are', 'VBB'), (',', 'PUN'), ('.', 'PUN')]

A non-lazy loader.

>>> eager = BNCCorpusReader(root=root, fileids=r'FX8.xml', lazy=False)
>>> len(eager.words())
151
>>> eager.words(stem=True)[6:17]
['right', 'abdominal', 'wound', ',', 'she', 'be', 'a', 'wee', 'bit', 'confuse', '.']
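
Sentence access should behave the same on the eager reader; a quick check (the expected count assumes the same 15-sentence FX8.xml sample shown above):

>>> len(eager.sents())
15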

# module_a: fetch abstract pages, normalise the text, and cluster it.
# (The name module_a is implied by the import in the tests below.)
import requests
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
from bs4 import BeautifulSoup
import re
import torch

def get_page(page_number: int) -> str:
    """Fetch one page and return the paragraph that contains the abstract."""
    url = f"http://192.10.8.32:8080/{page_number}.html"
    response = requests.get(url)
    response.encoding = 'windows-1251'  # the pages are served in a legacy Cyrillic encoding
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    all_p = soup.find_all('p')
    text = ""
    for p in all_p:
        # "Реферат" ("Abstract") marks the paragraph we want; if several
        # paragraphs match, the last one wins.
        if "Реферат" in p.get_text():
            text = p.get_text()
    return text

def extract_text(text):
    """Lower-case the text and keep only Cyrillic letters and whitespace."""
    text = text.lower()
    return re.sub(r'[^а-яё\s]', ' ', text).strip()
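
# Illustrative behaviour (note the doubled space left by the stripped comma):
#   extract_text("<p>Привет, мир!</p>")  ->  'привет  мир'
# Tags and Latin characters disappear because they fall outside [а-яё\s].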

def cluster_texts(texts):
    """Embed the texts with a Russian SBERT model and k-means them into 3 clusters."""
    # Use the GPU when available, otherwise fall back to CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = SentenceTransformer("sberbank-ai/sbert_large_mt_nlu_ru", device=device)
    embeddings = model.encode(
        texts,
        show_progress_bar=True,
        device=device,
        batch_size=16,
        normalize_embeddings=True
    )
    kmeans = KMeans(n_clusters=3, random_state=42, n_init='auto')
    cluster_labels = kmeans.fit_predict(embeddings)
    return [int(cluster) for cluster in cluster_labels]
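
# A minimal end-to-end sketch under the same assumptions as above (the host
# must be reachable; the page range 1..3 is hypothetical):
if __name__ == "__main__":
    pages = [get_page(n) for n in range(1, 4)]
    cleaned = [extract_text(p) for p in pages if p]
    if len(cleaned) >= 3:  # KMeans needs at least n_clusters samples
        print(cluster_texts(cleaned))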

# Tests for module_a (run standalone or via pytest).
from module_a import get_page, extract_text, cluster_texts

def test_get_page_returns_nonempty_text():
    try:
        page = get_page(1)
    except Exception as e:
        raise AssertionError(f"error while fetching the page text: {e}")
    assert len(page) > 0, 'abstract not found'

def test_extract_text_removes_html():
    # Give the function genuinely tagged input so the test matches its name.
    raw_html = "<p>Тестовое сообщение</p>"
    clean_text = extract_text(raw_html)
    assert isinstance(clean_text, str)
    assert "тестовое сообщение" in clean_text
    assert "<" not in clean_text and ">" not in clean_text

def test_cluster_texts_returns_valid_clusters():
    texts = [
        "Нейросети в медицине",
        "Обучение модели машинного обучения",
        "Применение искусственного интеллекта в здравоохранении"
    ]
    clusters = cluster_texts(texts)

    assert isinstance(clusters, list), "the result must be a list"
    assert len(clusters) == len(texts), "the number of cluster labels must match the number of texts"
    assert all(isinstance(label, int) for label in clusters), "all labels must be integers"
    assert set(clusters).issubset({0, 1, 2}), "cluster labels must be in the range [0, 2]"

if __name__ == "__main__":
    test_get_page_returns_nonempty_text()
    print("test_get_page_returns_nonempty_text PASSED")
    test_extract_text_removes_html()
    print("test_extract_text_removes_html PASSED")
    test_cluster_texts_returns_valid_clusters()
    print("test_cluster_texts_returns_valid_clusters PASSED")

# module_b: a RuBERT-based text classifier with 8 labels.
# (The name module_b is implied by the import in the tests below.)
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from safetensors.torch import load_file
import os

MODEL_BASE_NAME = "DeepPavlov/rubert-base-cased"
MODEL_DIR = "model_save_dir"
NUM_LABELS = 8
# Use the GPU when available, otherwise fall back to CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class RuBERTClassifier(nn.Module):
    """RuBERT encoder with a small MLP head for sequence classification."""

    def __init__(self, base_model_name, num_labels, dropout_rate=0.3):
        super().__init__()
        self.num_labels = num_labels
        self.bert = AutoModel.from_pretrained(base_model_name)
        hidden_size = self.bert.config.hidden_size
        self.dropout1 = nn.Dropout(dropout_rate)
        self.fc1 = nn.Linear(hidden_size, 256)
        self.relu = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(256, num_labels)
        self.loss_fnc = CrossEntropyLoss()

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the [CLS] token representation as the sequence embedding.
        cls = outputs.last_hidden_state[:, 0, :]
        x = self.dropout1(cls)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout2(x)
        logits = self.classifier(x)

        loss = None
        if labels is not None:
            loss = self.loss_fnc(logits.view(-1, self.num_labels), labels.view(-1))
        return {"loss": loss, "logits": logits}

tokenizer = AutoTokenizer.from_pretrained(MODEL_BASE_NAME)

def tokenize_func(examples):
    return tokenizer(examples['text'], truncation=True, max_length=512, padding="max_length")
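
# tokenize_func is shaped for batched mapping over a Hugging Face `datasets`
# object; a sketch of the assumed usage (`dataset` is hypothetical here):
#   dataset = dataset.map(tokenize_func, batched=True)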

model = RuBERTClassifier(MODEL_BASE_NAME, NUM_LABELS)

def load_model():
    """Load the fine-tuned weights from MODEL_DIR into the global model."""
    state_dict = load_file(os.path.join(MODEL_DIR, "model.safetensors"))
    model.load_state_dict(state_dict)
    model.to(DEVICE)
    model.eval()

# The weights are loaded once at import time, so importing this module
# requires model_save_dir/model.safetensors to exist.
load_model()

id2label = {0:'A', 1:'B', 2:'C', 3:'D', 4:'E', 5:'F', 6:'G', 7:'H'}

def predict(text):
    """Classify one text and return (label, confidence)."""
    inputs = tokenizer(text, truncation=True, max_length=512, padding="max_length", return_tensors="pt")
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    with torch.no_grad():
        logits = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])["logits"]
    probabilities = torch.softmax(logits, dim=1)
    confidence, predicted_id = torch.max(probabilities, dim=1)
    return id2label.get(predicted_id.item()), confidence.item()
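
# A quick smoke test of the classifier; the input string is illustrative and
# the fine-tuned weights in model_save_dir must be present:
if __name__ == "__main__":
    label, confidence = predict("Пример текста для классификации.")
    print(label, round(confidence, 3))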


# Tests for module_b (run standalone or via pytest).
from module_b import tokenizer, predict, load_model, id2label, RuBERTClassifier, MODEL_BASE_NAME, NUM_LABELS, DEVICE
import torch

def test_tokenizer_output_shape():
    test_text = "пример текста"
    inputs = tokenizer(test_text, truncation=True, max_length=512, padding="max_length")
    assert len(inputs['input_ids']) == 512, "input_ids must have length 512"
    assert "attention_mask" in inputs, "attention_mask must be present"

def test_load_model():
    try:
        load_model()
    except Exception as e:
        raise AssertionError(f"error while loading the model: {e}")

def test_model_forward_pass():
    model = RuBERTClassifier(MODEL_BASE_NAME, NUM_LABELS)
    model.to(DEVICE)
    model.eval()
    # return_tensors="pt" is required here: without it the tokenizer returns
    # plain Python lists, which have no .to() method.
    inputs = tokenizer("Пример текста для forward pass", truncation=True, max_length=512, padding="max_length", return_tensors="pt")
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
    assert "logits" in outputs, "the model output must contain logits"
    assert outputs["logits"].shape[1] == NUM_LABELS, "the output size must match the number of labels"

def test_predict_output():
    test_text = "Это тестовый текст для проверки модели."
    label, confidence = predict(test_text)
    assert label in id2label.values(), "the predicted label is outside the allowed set"
    assert 0 <= confidence <= 1, "the confidence must be between 0 and 1"

if __name__ == "__main__":
    test_tokenizer_output_shape()
    print("test_tokenizer_output_shape PASSED")
    test_load_model()
    print("test_load_model PASSED")
    test_model_forward_pass()
    print("test_model_forward_pass PASSED")
    test_predict_output()
    print("test_predict_output PASSED")
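
# The test functions above also collect under pytest; assuming this file is
# saved as test_module_b.py (hypothetical name), run e.g.:
#   pytest test_module_b.py -q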