| from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig, AutoModel |
| from datasets import Dataset, DatasetDict, Features, Sequence, ClassLabel, Value |
| import pandas as pd |
| import re |
| import torch |
| from Preprocess import * |
|
|
# Load the fine-tuned classifier checkpoint once at import time so that
# eval() reuses the same tokenizer/model across calls.
model_path = "Model-V1.1"
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path, config=config)
# id -> label and label -> id mappings taken from the checkpoint config.
# NOTE(review): eval() below assumes an 'AI' entry exists in label2id —
# confirm against the checkpoint's config.json.
labels = model.config.id2label
label2id = model.config.label2id
|
|
def preprocess(code):
    """Normalize raw source code before tokenization.

    Runs the Preprocess-module helpers in a fixed order: comment
    removal, preprocessor replacement, brace normalization, and
    per-line stripping. Returns the transformed string.
    """
    pipeline = (remove_comments, replace_preprocessor, normalize_braces, strip_lines)
    for step in pipeline:
        code = step(code)
    return code
|
|
|
|
def eval(source):
    """Classify a source-code string with the loaded model.

    NOTE(review): the name shadows the builtin ``eval``; kept unchanged
    for backward compatibility with existing callers.

    Args:
        source: Raw source-code string to classify.

    Returns:
        A ``(label, probability)`` tuple: the predicted label string
        from the model config's ``id2label`` mapping, and the
        probability the model assigns to the 'AI' class, formatted
        like ``"97.31 %"``.
    """
    source = preprocess(source)
    inputs = tokenizer(
        source,
        truncation=True,
        padding='max_length',
        max_length=512,
        return_tensors='pt',
    )

    # Run inference on CPU; eval() switches off dropout/batch-norm updates.
    model.cpu()
    model.eval()
    inputs = {k: v.cpu() for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Softmax over the label dimension; batch size is 1, so take row 0.
    # (.detach() removed: it is redundant inside torch.no_grad().)
    probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
    # Cast numpy.int64 -> int so the id2label dict lookup is unambiguous.
    pred_id = int(probs.argmax())

    return labels[pred_id], f"{probs[label2id['AI']] * 100:.2f} %"
|
|