from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from safetensors.torch import load_file
import os

MODEL_BASE_NAME = "DeepPavlov/rubert-base-cased"
MODEL_DIR = "model_save_dir"
NUM_LABELS = 8
# Prefer GPU but fall back to CPU so the module still loads without CUDA.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class RuBERTClassifier(nn.Module):
    """RuBERT encoder with a classification head: dropout -> Linear(256) -> ReLU -> dropout -> Linear."""

    def __init__(self, base_model_name, num_labels, dropout_rate=0.3):
        super().__init__()
        self.num_labels = num_labels
        self.bert = AutoModel.from_pretrained(base_model_name)
        hidden_size = self.bert.config.hidden_size
        self.dropout1 = nn.Dropout(dropout_rate)
        self.fc1 = nn.Linear(hidden_size, 256)
        self.relu = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(256, num_labels)
        self.loss_fnc = CrossEntropyLoss()

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Pool the sequence by taking the [CLS] token embedding.
        cls = outputs.last_hidden_state[:, 0, :]
        x = self.dropout1(cls)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout2(x)
        logits = self.classifier(x)
        loss = None
        if labels is not None:
            loss = self.loss_fnc(logits.view(-1, self.num_labels), labels.view(-1))
        return {"loss": loss, "logits": logits}


tokenizer = AutoTokenizer.from_pretrained(MODEL_BASE_NAME)


def tokenize_func(examples):
    # Batched tokenization helper (e.g., for datasets.Dataset.map with batched=True);
    # it is not used in this module's inference path.
    return tokenizer(examples['text'], truncation=True, max_length=512, padding="max_length")


model = RuBERTClassifier(MODEL_BASE_NAME, NUM_LABELS)


def load_model():
    """Load the fine-tuned weights from MODEL_DIR into the module-level model."""
    state_dict = load_file(os.path.join(MODEL_DIR, "model.safetensors"))
    model.load_state_dict(state_dict)
    model.to(DEVICE)
    model.eval()


# Runs at import time, so MODEL_DIR/model.safetensors must exist before importing this module.
load_model()

# Mapping from class indices to human-readable labels.
id2label = {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F', 6: 'G', 7: 'H'}


def predict(text):
    """Classify a single text; returns (label, confidence)."""
    inputs = tokenizer(text, truncation=True, max_length=512, padding="max_length", return_tensors="pt")
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    with torch.no_grad():
        logits = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])["logits"]
    probabilities = torch.softmax(logits, dim=1)
    confidence, predicted_id = torch.max(probabilities, dim=1)
    return id2label.get(predicted_id.item()), confidence.item()
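

# Example usage (assumes MODEL_DIR/model.safetensors exists):
#   label, confidence = predict('пример текста')


# ----------------------------------------------------------------------------
# Test module: imports from module_b (the classifier code above). The test_*
# functions are pytest-compatible; the __main__ block also runs them directly.
# ----------------------------------------------------------------------------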
from module_b import tokenizer, predict, load_model, id2label, RuBERTClassifier, MODEL_BASE_NAME, NUM_LABELS, DEVICE
import torch
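
# Importing module_b already calls load_model(), so all of these tests assume
# the trained weights are present in MODEL_DIR.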


def test_tokenizer_output_shape():
    test_text = "пример текста"  # Russian sample input, matching the Russian-language RuBERT model
    inputs = tokenizer(test_text, truncation=True, max_length=512, padding="max_length")
    assert len(inputs['input_ids']) == 512, "input_ids length must be 512"
    assert "attention_mask" in inputs, "attention_mask must be present"


def test_load_model():
    try:
        load_model()
    except Exception as e:
        assert False, f"Failed to load the model: {e}"


def test_model_forward_pass():
    # A freshly initialized model is enough to check the output shape.
    model = RuBERTClassifier(MODEL_BASE_NAME, NUM_LABELS)
    model.to(DEVICE)
    model.eval()
    # return_tensors="pt" is required so the inputs are tensors that can be moved to DEVICE.
    inputs = tokenizer("Пример текста для forward pass", truncation=True, max_length=512,
                       padding="max_length", return_tensors="pt")
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
    assert "logits" in outputs, "model output must contain logits"
    assert outputs["logits"].shape[1] == NUM_LABELS, "output size must match the number of labels"


def test_predict_output():
    test_text = "Это тестовый текст для проверки модели."  # Russian sample input for RuBERT
    label, confidence = predict(test_text)
    assert label in id2label.values(), "Predicted label is outside the allowed set."
    assert 0 <= confidence <= 1, "Confidence must be between 0 and 1."


if __name__ == "__main__":
    test_tokenizer_output_shape()
    print("test_tokenizer_output_shape PASSED")
    test_load_model()
    print("test_load_model PASSED")
    test_model_forward_pass()
    print("test_model_forward_pass PASSED")
    test_predict_output()
    print("test_predict_output PASSED")