Sample usage for bnc
Checking word access. A minimal setup first, assuming the sample BNC file FX8.xml shipped with the NLTK test data:

>>> import os.path
>>> from nltk.corpus.reader import BNCCorpusReader
>>> import nltk.test
>>> root = os.path.dirname(nltk.test.__file__)
>>> bnc = BNCCorpusReader(root=root, fileids='FX8.xml')
>>> len(bnc.words())
151
>>> bnc.words()[:6]
['Ah', 'there', 'we', 'are', ',', '.']
>>> bnc.words(stem=True)[:6]
['ah', 'there', 'we', 'be', ',', '.']
>>> bnc.tagged_words()[:6]
[('Ah', 'INTERJ'), ('there', 'ADV'), ('we', 'PRON'), ('are', 'VERB'), (',', 'PUN'), ('.', 'PUN')]
>>> bnc.tagged_words(c5=True)[:6]
[('Ah', 'ITJ'), ('there', 'AV0'), ('we', 'PNP'), ('are', 'VBB'), (',', 'PUN'), ('.', 'PUN')]
Testing access to the sentences.
>>> len(bnc.sents())
15
>>> bnc.tagged_sents()[0]
[('Ah', 'INTERJ'), ('there', 'ADV'), ('we', 'PRON'), ('are', 'VERB'), (',', 'PUN'), ('.', 'PUN')]
>>> bnc.tagged_sents(c5=True)[0]
[('Ah', 'ITJ'), ('there', 'AV0'), ('we', 'PNP'), ('are', 'VBB'), (',', 'PUN'), ('.', 'PUN')]
A non-lazy loader.
>>> eager = BNCCorpusReader(root=root, fileids=r'FX8.xml', lazy=False)
>>> len(eager.words())
151
>>> eager.words(stem=True)[6:17]
['right', 'abdominal', 'wound', ',', 'she', 'be', 'a', 'wee', 'bit', 'confuse', '.']
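The tagged views combine naturally with NLTK's frequency tools. A short sketch (not part of the original doctest) counting C5 tags over the same reader:

from nltk import FreqDist

# Tally C5 tag frequencies across the sample file; `eager` is the
# non-lazy reader constructed above.
c5_tags = FreqDist(tag for _, tag in eager.tagged_words(c5=True))
print(c5_tags.most_common(5))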
Below is module_a.py: it fetches pages from an internal server, extracts the abstract paragraph, and clusters texts with sentence embeddings.

import re

import requests
import torch
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans


def get_page(page_number: int) -> str:
    """Fetch one page and return the paragraph containing the abstract."""
    url = f"http://192.10.8.32:8080/{page_number}.html"
    response = requests.get(url)
    response.encoding = 'windows-1251'
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    text = ""
    # Keep the last <p> whose text contains the marker word "Реферат" ("Abstract").
    for p in soup.find_all('p'):
        if "Реферат" in p.get_text():
            text = p.get_text()
    return text


def extract_text(text: str) -> str:
    """Lowercase the text and keep only Cyrillic letters and whitespace."""
    text = text.lower()
    return re.sub(r'[^а-яё\s]', ' ', text).strip()


def cluster_texts(texts: list[str]) -> list[int]:
    """Embed the texts with a Russian SBERT model and group them with k-means."""
    # Fall back to CPU when CUDA is unavailable.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = SentenceTransformer("sberbank-ai/sbert_large_mt_nlu_ru", device=device)
    embeddings = model.encode(
        texts,
        show_progress_bar=True,
        batch_size=16,
        normalize_embeddings=True,
    )
    kmeans = KMeans(n_clusters=3, random_state=42, n_init='auto')
    cluster_labels = kmeans.fit_predict(embeddings)
    return [int(cluster) for cluster in cluster_labels]
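A minimal end-to-end sketch tying the three helpers together (hypothetical: it assumes the internal server at 192.10.8.32 is reachable and that pages 1 through 10 exist):

# Fetch ten pages, keep the non-empty abstracts, normalize, and cluster.
raw_pages = [get_page(n) for n in range(1, 11)]
cleaned = [extract_text(p) for p in raw_pages if p]
labels = cluster_texts(cleaned)
for text, label in zip(cleaned, labels):
    print(label, text[:60])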
The tests for module_a.py:

from module_a import cluster_texts, extract_text, get_page


def test_get_page_returns_nonempty_text():
    try:
        page = get_page(1)
    except Exception as e:
        assert False, f"error while fetching text from the page: {e}"
    assert len(page) > 0, 'abstract not found'


def test_extract_text_removes_html():
    # The fixture must contain markup for the tag-removal asserts to be meaningful.
    raw_html = "<p>Тестовое сообщение</p>"
    clean_text = extract_text(raw_html)
    assert isinstance(clean_text, str)
    assert "тестовое сообщение" in clean_text
    assert "<" not in clean_text and ">" not in clean_text


def test_cluster_texts_returns_valid_clusters():
    texts = [
        "Нейросети в медицине",
        "Обучение модели машинного обучения",
        "Применение искусственного интеллекта в здравоохранении",
    ]
    clusters = cluster_texts(texts)
    assert isinstance(clusters, list), "the result must be a list"
    assert len(clusters) == len(texts), "the number of cluster labels must match the number of texts"
    assert all(isinstance(label, int) for label in clusters), "all labels must be integers"
    assert set(clusters).issubset({0, 1, 2}), "cluster labels must lie in the range [0, 2]"


if __name__ == "__main__":
    test_get_page_returns_nonempty_text()
    print("test_get_page_returns_nonempty_text PASSED")
    test_extract_text_removes_html()
    print("test_extract_text_removes_html PASSED")
    test_cluster_texts_returns_valid_clusters()
    print("test_cluster_texts_returns_valid_clusters PASSED")
Next, module_b.py defines a RuBERT-based classifier and loads fine-tuned weights from a safetensors checkpoint.

import os

import torch
import torch.nn as nn
from safetensors.torch import load_file
from torch.nn import CrossEntropyLoss
from transformers import AutoModel, AutoTokenizer

MODEL_BASE_NAME = "DeepPavlov/rubert-base-cased"
MODEL_DIR = "model_save_dir"
NUM_LABELS = 8
# Fall back to CPU when CUDA is unavailable.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class RuBERTClassifier(nn.Module):
    """RuBERT encoder topped with a two-layer classification head."""

    def __init__(self, base_model_name, num_labels, dropout_rate=0.3):
        super().__init__()
        self.num_labels = num_labels
        self.bert = AutoModel.from_pretrained(base_model_name)
        hidden_size = self.bert.config.hidden_size
        self.dropout1 = nn.Dropout(dropout_rate)
        self.fc1 = nn.Linear(hidden_size, 256)
        self.relu = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(256, num_labels)
        self.loss_fnc = CrossEntropyLoss()

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the [CLS] token embedding as the sequence representation.
        cls = outputs.last_hidden_state[:, 0, :]
        x = self.dropout1(cls)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout2(x)
        logits = self.classifier(x)
        loss = None
        if labels is not None:
            loss = self.loss_fnc(logits.view(-1, self.num_labels), labels.view(-1))
        return {"loss": loss, "logits": logits}
tokenizer = AutoTokenizer.from_pretrained(MODEL_BASE_NAME)


def tokenize_func(examples):
    # Batched tokenization helper; expects a mapping with a 'text' column.
    return tokenizer(examples['text'], truncation=True, max_length=512, padding="max_length")
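tokenize_func is shaped for batched mapping over a Hugging Face Dataset. A hedged sketch (the datasets dependency and the two-row fixture are assumptions, not part of the original module):

from datasets import Dataset

# Hypothetical two-row dataset; map adds input_ids/attention_mask columns.
ds = Dataset.from_dict({"text": ["первый пример", "второй пример"]})
tokenized = ds.map(tokenize_func, batched=True)
print(tokenized.column_names)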
model = RuBERTClassifier(MODEL_BASE_NAME, NUM_LABELS)


def load_model():
    """Load the fine-tuned weights from safetensors and switch to eval mode."""
    state_dict = load_file(os.path.join(MODEL_DIR, "model.safetensors"))
    model.load_state_dict(state_dict)
    model.to(DEVICE)
    model.eval()


load_model()

id2label = {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F', 6: 'G', 7: 'H'}


def predict(text):
    """Return the predicted label and its softmax confidence for a single text."""
    inputs = tokenizer(text, truncation=True, max_length=512, padding="max_length", return_tensors="pt")
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    with torch.no_grad():
        logits = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])["logits"]
    probabilities = torch.softmax(logits, dim=1)
    confidence, predicted_id = torch.max(probabilities, dim=1)
    return id2label.get(predicted_id.item()), confidence.item()
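A quick smoke test of predict, assuming the fine-tuned checkpoint exists at model_save_dir/model.safetensors:

label, confidence = predict("Это тестовый текст для проверки модели.")
print(f"label={label}, confidence={confidence:.3f}")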
The tests for module_b.py:

import torch

from module_b import (DEVICE, MODEL_BASE_NAME, NUM_LABELS, RuBERTClassifier,
                      id2label, load_model, predict, tokenizer)


def test_tokenizer_output_shape():
    test_text = "пример текста"
    inputs = tokenizer(test_text, truncation=True, max_length=512, padding="max_length")
    assert len(inputs['input_ids']) == 512, "input_ids length must be 512"
    assert "attention_mask" in inputs, "attention_mask must be present"


def test_load_model():
    try:
        load_model()
    except Exception as e:
        assert False, f"error while loading the model: {e}"


def test_model_forward_pass():
    model = RuBERTClassifier(MODEL_BASE_NAME, NUM_LABELS)
    model.to(DEVICE)
    model.eval()
    # return_tensors="pt" is required so the values are tensors that .to(DEVICE) can move.
    inputs = tokenizer("Пример текста для forward pass", truncation=True,
                       max_length=512, padding="max_length", return_tensors="pt")
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
    assert "logits" in outputs, "the model output must contain logits"
    assert outputs["logits"].shape[1] == NUM_LABELS, "the output size must match the number of labels"


def test_predict_output():
    test_text = "Это тестовый текст для проверки модели."
    label, confidence = predict(test_text)
    assert label in id2label.values(), "the predicted label is outside the allowed set"
    assert 0 <= confidence <= 1, "the confidence must be between 0 and 1"


if __name__ == "__main__":
    test_tokenizer_output_shape()
    print("test_tokenizer_output_shape PASSED")
    test_load_model()
    print("test_load_model PASSED")
    test_model_forward_pass()
    print("test_model_forward_pass PASSED")
    test_predict_output()
    print("test_predict_output PASSED")