LLMSniffer

Sleeping

App Files Files Community

Abir Muhtasim committed 19 days ago

Commit

c52c03c

1 Parent(s): 3018397

upload files

Browse files

Files changed (5) hide show

app.py +52 -0
backend_model.py +118 -0
models/java_classifier.pth +3 -0
models/python_classifier.pth +3 -0
requirements.txt +68 -0

app.py ADDED Viewed

	@@ -0,0 +1,52 @@

+import streamlit as st
+from backend_model import load_model_and_tokenizer, infer_single_sample
+# Encoder name handed to load_model() as `arch` when "Java" is selected.
+java_model_architecture = 'microsoft/graphcodebert-base'
+# Checkpoint file handed to load_model() as `path` when "Java" is selected.
+java_model_path = 'models/java_classifier.pth'
+# Encoder name handed to load_model() as `arch` for any non-Java selection.
+python_model_architecture = 'microsoft/graphcodebert-base'
+# Checkpoint file handed to load_model() as `path` for any non-Java selection.
+python_model_path = 'models/python_classifier.pth'
+@st.cache_resource
+def load_model(arch, path):
+    return  load_model_and_tokenizer(arch, path)
+st.title('LLM Sniffer')
+# form
+with st.form(key='my_form'):
+    # select language - java or python
+    language = st.selectbox(
+        label="Select Language",
+        options=["Java", "Python"],
+        key="language"
+    )
+    # text area
+    code = st.text_area(label="", value="", label_visibility="hidden", height=300, placeholder="Paste your code here", key="code")
+    # submit button
+    submit_button = st.form_submit_button(label='Submit')
+    if submit_button:
+        if code:
+            if language == "Java":
+                model, tokenizer = load_model(java_model_architecture, java_model_path)
+            else:
+                model, tokenizer = load_model(python_model_architecture, python_model_path)
+            result = infer_single_sample(
+                code_text=code,
+                model=model,
+                tokenizer=tokenizer,
+                language=language
+            )
+            st.write(result)

backend_model.py ADDED Viewed

	@@ -0,0 +1,118 @@

+import torch
+import torch.nn as nn
+import torch.optim as optim
+import re
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel, Trainer, TrainingArguments
+from torch.utils.data import DataLoader, Dataset
+# Module-wide target device: CUDA when torch reports it as available, else CPU.
+# load_model_and_tokenizer() moves the model here and infer_single_sample()
+# moves its input tensors here.
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+# device = torch.device('cpu')  # uncomment to force CPU execution
+def remove_java_comments(code):
+    # Remove single-line comments (//)
+    code = re.sub(r'//.*', '', code)
+    # Remove multi-line comments (/* ... */)
+    code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
+    return code
+def remove_python_comments(code):
+    # Remove single-line comments (#)
+    code = re.sub(r'#.*', '', code)
+    # Remove multi-line comments (""" ... """ or ''' ... ''')
+    code = re.sub(r'""".*?"""', '', code, flags=re.DOTALL)
+    code = re.sub(r"'''.*?'''", '', code, flags=re.DOTALL)
+    return code
+# Model with Binary Classifier
+class CodeBERTBinaryClassifier(nn.Module):
+    def __init__(self, encoder_model, hidden_size=256, num_layers=2):
+        super(CodeBERTBinaryClassifier, self).__init__()
+        self.encoder = encoder_model
+        self.classifier = nn.Sequential(
+            nn.Dropout(0.3),  # Dropout with 30%
+            nn.Linear(self.encoder.config.hidden_size, 128),  # Hidden layer with 128 units
+            nn.BatchNorm1d(128),  # Batch normalization for the hidden layer
+            nn.ReLU(),  # ReLU activation for the hidden layer
+            nn.Dropout(0.3),  # Dropout with 30%
+            nn.Linear(128, 1)  # Output layer with 1 unit
+        )
+    def forward(self, input_ids, attention_mask):
+        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
+        cls_output = outputs.last_hidden_state[:, 0, :]  # [CLS] token representation
+        logits = self.classifier(cls_output.detach()).squeeze(-1)  # Squeeze for binary logit
+        return logits, cls_output
+def infer_single_sample(code_text, model, tokenizer, language='java'):
+    # Ensure model is in evaluation mode
+    model.eval()
+    # Remove comments from the code (assuming the same preprocessing as during training)
+    if language == 'python':
+        code_text = remove_python_comments(code_text)
+    else:
+        code_text = remove_java_comments(code_text)
+    # print(code_text)
+    # Tokenize the input
+    inputs = tokenizer.encode_plus(
+        code_text,
+        padding='max_length',
+        max_length=512,
+        truncation=True,
+        return_tensors='pt'
+    )
+    # Move inputs to the specified device
+    input_ids = inputs['input_ids'].to(device)
+    attention_mask = inputs['attention_mask'].to(device)
+    # Disable gradient computation for inference
+    with torch.no_grad():
+        # Get model prediction
+        logits, _ = model(input_ids, attention_mask)
+        # Apply sigmoid to get probability
+        probability = torch.sigmoid(logits).cpu().item()
+        # Classify based on 0.5 threshold
+        predicted_label = 1 if probability > 0.5 else 0
+    return {
+        'probability': probability,
+        'predicted_label': predicted_label,
+        'interpretation': 'GPT-generated' if predicted_label == 0 else 'Human-written'
+    }
+def load_model_and_tokenizer(model_architecture, model_path):
+    tokenizer = AutoTokenizer.from_pretrained(model_architecture)
+    base_model = AutoModel.from_pretrained(model_architecture)
+    model = CodeBERTBinaryClassifier(base_model)
+    # model = model.to(device)
+    # Load the model
+    # model = CodeBERTBinaryClassifier(base_model)
+    model.load_state_dict(torch.load(model_path))
+    model = model.to(device)
+    return model, tokenizer

models/java_classifier.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9cb0fa45719a7bddfc6fbe3e64eb1f83b41bbfba4e17d74370b8da1b02036341
+size 499068174

models/python_classifier.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6925f2f2faaf6d80ad821b4f2593630ef00271363ff39ebb28f2e5b7f2657948
+size 499068174

requirements.txt ADDED Viewed

	@@ -0,0 +1,68 @@

+altair==5.5.0
+attrs==24.2.0
+blinker==1.9.0
+cachetools==5.5.0
+certifi==2024.8.30
+charset-normalizer==3.4.0
+click==8.1.7
+filelock==3.16.1
+fsspec==2024.10.0
+gitdb==4.0.11
+GitPython==3.1.43
+huggingface-hub==0.26.5
+idna==3.10
+Jinja2==3.1.4
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+mdurl==0.1.2
+mpmath==1.3.0
+narwhals==1.16.0
+networkx==3.4.2
+numpy==2.2.0
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+packaging==24.2
+pandas==2.2.3
+pillow==11.0.0
+protobuf==5.29.1
+pyarrow==18.1.0
+pydeck==0.9.1
+Pygments==2.18.0
+python-dateutil==2.9.0.post0
+pytz==2024.2
+PyYAML==6.0.2
+referencing==0.35.1
+regex==2024.11.6
+requests==2.32.3
+rich==13.9.4
+rpds-py==0.22.3
+safetensors==0.4.5
+setuptools==75.6.0
+six==1.17.0
+smmap==5.0.1
+streamlit==1.40.2
+sympy==1.13.1
+tenacity==9.0.0
+tokenizers==0.21.0
+toml==0.10.2
+torch==2.5.1
+tornado==6.4.2
+tqdm==4.67.1
+transformers==4.47.0
+triton==3.1.0
+typing_extensions==4.12.2
+tzdata==2024.2
+urllib3==2.2.3
+watchdog==6.0.0