hackerbyhobby committed on
Commit
def395e
·
unverified ·
1 Parent(s): 88a5e15

updated app and other files

Browse files
Files changed (4) hide show
  1. app.py +83 -71
  2. other_scam_keywords.txt +9 -0
  3. requirements.txt +2 -1
  4. smishing_keywords.txt +12 -0
app.py CHANGED
@@ -4,99 +4,104 @@ from PIL import Image
4
  from transformers import pipeline
5
  import re
6
 
7
- # 1. Load scam keywords from file
8
- # Each line in 'scam_keywords.txt' is treated as a separate keyword.
9
- with open("scam_keywords.txt", "r", encoding="utf-8") as f:
10
- SCAM_KEYWORDS = [line.strip().lower() for line in f if line.strip()]
11
 
12
- # 2. Zero-Shot Classification Pipeline
 
 
 
13
  model_name = "joeddav/xlm-roberta-large-xnli"
14
  classifier = pipeline("zero-shot-classification", model=model_name)
15
 
 
16
  CANDIDATE_LABELS = ["SMiShing", "Other Scam", "Legitimate"]
17
 
18
- def keyword_and_url_boost(probabilities, text):
19
  """
20
- Adjust final probabilities if certain scam-related keywords or URLs appear.
21
- - probabilities: dict, label -> original probability
22
- - text: the combined text from user input + OCR
23
-
24
- Returns an updated dict of probabilities that sum to 1.
25
  """
26
  lower_text = text.lower()
27
 
28
- # 1. Check scam keywords
29
- keyword_count = sum(1 for kw in SCAM_KEYWORDS if kw in lower_text)
30
- keyword_boost = 0.50 * keyword_count # 5% per found keyword
31
- keyword_boost = min(keyword_boost, 0.30) # cap at +30%
 
 
 
 
32
 
33
- # 2. Check if there's any URL (simple regex for http/https)
34
  found_urls = re.findall(r"(https?://[^\s]+)", lower_text)
35
- url_boost = 0.0
36
  if found_urls:
37
- # For demonstration: a flat +30% if a URL is found
38
- url_boost = 0.30
39
-
40
- # 3. Combine total boost
41
- total_boost = keyword_boost + url_boost
42
- total_boost = min(total_boost, 0.80) # cap at +80%
43
-
44
- if total_boost <= 0:
45
- return probabilities # no change if no keywords/URLs found
46
-
47
- smishing_prob = probabilities["SMiShing"]
48
- other_scam_prob = probabilities["Other Scam"]
49
- legit_prob = probabilities["Legitimate"]
50
-
51
- # 4. Distribute the total boost equally to "SMiShing" and "Other Scam"
52
- half_boost = total_boost / 2.0
53
- smishing_boosted = smishing_prob + url_boost
54
- other_scam_boosted = other_scam_prob + keyword_boost
55
- legit_boosted = legit_prob
56
-
57
- # 5. Re-normalize so they sum to 1
58
- total = smishing_boosted + other_scam_boosted + legit_boosted
 
 
 
59
  if total > 0:
60
- smishing_final = smishing_boosted / total
61
- other_scam_final = other_scam_boosted / total
62
- legit_final = legit_boosted / total
63
  else:
64
- smishing_final = 0.0
65
- other_scam_final = 0.0
66
- legit_final = 1.0
67
 
68
  return {
69
- "SMiShing": smishing_final,
70
- "Other Scam": other_scam_final,
71
- "Legitimate": legit_final
72
  }
73
 
74
  def smishing_detector(text, image):
75
  """
76
- 1. Extract text from the image (OCR) if provided.
77
- 2. Combine with user-entered text.
78
- 3. Zero-shot classification -> base probabilities.
79
- 4. Keyword + URL boost -> adjusted probabilities.
80
- 5. Return final label, confidence, etc.
81
  """
82
- # Step 1: OCR if there's an image
83
- combined_text = text if text else ""
84
  if image is not None:
85
  ocr_text = pytesseract.image_to_string(image, lang="spa+eng")
86
  combined_text += " " + ocr_text
87
-
88
- # Clean text
89
  combined_text = combined_text.strip()
 
90
  if not combined_text:
91
  return {
92
  "text_used_for_classification": "(none)",
93
  "label": "No text provided",
94
  "confidence": 0.0,
95
- "keywords_found": [],
 
96
  "urls_found": []
97
  }
98
 
99
- # Step 2: Zero-shot classification
100
  result = classifier(
101
  sequences=combined_text,
102
  candidate_labels=CANDIDATE_LABELS,
@@ -104,25 +109,29 @@ def smishing_detector(text, image):
104
  )
105
  original_probs = dict(zip(result["labels"], result["scores"]))
106
 
107
- # Step 3: Keyword + URL boost
108
- boosted_probs = keyword_and_url_boost(original_probs, combined_text)
109
-
110
- # Step 4: Pick final label after boost
111
  final_label = max(boosted_probs, key=boosted_probs.get)
112
  final_confidence = round(boosted_probs[final_label], 3)
113
 
114
- # Step 5: Identify which keywords and URLs were found
115
  lower_text = combined_text.lower()
116
- found_keywords = [kw for kw in SCAM_KEYWORDS if kw in lower_text]
 
117
  found_urls = re.findall(r"(https?://[^\s]+)", lower_text)
118
 
119
  return {
120
  "text_used_for_classification": combined_text,
121
- "original_probabilities": {k: round(v, 3) for k, v in original_probs.items()},
122
- "boosted_probabilities": {k: round(v, 3) for k, v in boosted_probs.items()},
 
 
 
 
123
  "label": final_label,
124
  "confidence": final_confidence,
125
- "keywords_found": found_keywords,
 
126
  "urls_found": found_urls,
127
  }
128
 
@@ -140,11 +149,14 @@ demo = gr.Interface(
140
  )
141
  ],
142
  outputs="json",
143
- title="SMiShing & Scam Detector (Keyword + URL Boost)",
144
  description="""
145
  This tool classifies messages as SMiShing, Other Scam, or Legitimate using a zero-shot model
146
- (joeddav/xlm-roberta-large-xnli). It also checks for certain "scam keywords" (loaded from a file)
147
- and any URLs, boosting the probability of a scam label if found.
 
 
 
148
  Supports English & Spanish text (OCR included).
149
  """,
150
  allow_flagging="never"
 
4
  from transformers import pipeline
5
  import re
6
 
7
# Keyword lists: one keyword per line; blank lines are dropped and every
# keyword is lowercased so it can be matched against lowercased message text.
with open("smishing_keywords.txt", "r", encoding="utf-8") as f:
    SMISHING_KEYWORDS = [kw.lower() for kw in map(str.strip, f) if kw]

with open("other_scam_keywords.txt", "r", encoding="utf-8") as f:
    OTHER_SCAM_KEYWORDS = [kw.lower() for kw in map(str.strip, f) if kw]

# Zero-shot classification pipeline, built once at import time.
model_name = "joeddav/xlm-roberta-large-xnli"
classifier = pipeline("zero-shot-classification", model=model_name)

# The three labels every message is scored against.
CANDIDATE_LABELS = ["SMiShing", "Other Scam", "Legitimate"]
20
 
21
def boost_probabilities(
    probabilities: dict,
    text: str,
    *,
    smishing_keywords=None,
    other_scam_keywords=None,
    keyword_boost: float = 0.10,
    url_boost: float = 0.20,
) -> dict:
    """
    Shift probability mass from "Legitimate" toward the two scam labels.

    Every entry of ``smishing_keywords`` that occurs as a substring of the
    lowercased ``text`` adds ``keyword_boost`` to "SMiShing"; every entry of
    ``other_scam_keywords`` that occurs adds ``keyword_boost`` to
    "Other Scam". If the text contains at least one http/https URL,
    ``url_boost`` is added once to "SMiShing". The sum of all boosts is
    subtracted from "Legitimate". Negative results are clamped to 0 and the
    three values are re-normalized so they sum to 1.

    Args:
        probabilities: Mapping with the keys "SMiShing", "Other Scam" and
            "Legitimate". It is not modified.
        text: Message text to scan for keywords and URLs.
        smishing_keywords: Keywords that boost "SMiShing". Defaults to the
            module-level ``SMISHING_KEYWORDS``. Matching is done against the
            lowercased text, so entries must already be lowercase to match.
        other_scam_keywords: Keywords that boost "Other Scam". Defaults to
            the module-level ``OTHER_SCAM_KEYWORDS``. Same lowercase caveat.
        keyword_boost: Amount added per matched keyword.
        url_boost: Amount added to "SMiShing" when any URL is present.

    Returns:
        A new dict with the same three keys and the adjusted probabilities.
        If all three values are zero after clamping, the result falls back
        to "Legitimate" = 1.0.

    Raises:
        KeyError: If ``probabilities`` is missing one of the three labels.
    """
    # ``None`` means "use the lists loaded at module import"; an explicit
    # empty list is respected and disables that keyword group.
    if smishing_keywords is None:
        smishing_keywords = SMISHING_KEYWORDS
    if other_scam_keywords is None:
        other_scam_keywords = OTHER_SCAM_KEYWORDS

    lower_text = text.lower()

    # Each keyword counts at most once, no matter how often it appears.
    smishing_hits = sum(1 for kw in smishing_keywords if kw in lower_text)
    other_scam_hits = sum(1 for kw in other_scam_keywords if kw in lower_text)

    smishing_boost = keyword_boost * smishing_hits
    other_scam_boost = keyword_boost * other_scam_hits

    # Only the presence of a URL matters here, so stop at the first match
    # instead of collecting every URL.
    if re.search(r"https?://[^\s]+", lower_text):
        smishing_boost += url_boost

    # Whatever the scam labels gain, "Legitimate" loses.
    total_boost = smishing_boost + other_scam_boost
    p_smishing = max(probabilities["SMiShing"] + smishing_boost, 0.0)
    p_other_scam = max(probabilities["Other Scam"] + other_scam_boost, 0.0)
    p_legit = max(probabilities["Legitimate"] - total_boost, 0.0)

    total = p_smishing + p_other_scam + p_legit
    if total > 0:
        p_smishing /= total
        p_other_scam /= total
        p_legit /= total
    else:
        # Nothing left to normalize: default to "Legitimate".
        p_smishing, p_other_scam, p_legit = 0.0, 0.0, 1.0

    return {
        "SMiShing": p_smishing,
        "Other Scam": p_other_scam,
        "Legitimate": p_legit,
    }
80
 
81
  def smishing_detector(text, image):
82
  """
83
+ 1. OCR if image provided.
84
+ 2. Zero-shot classify => base probabilities.
85
+ 3. Boost probabilities based on keywords + URL logic.
86
+ 4. Return final classification + confidence.
 
87
  """
88
+ combined_text = text or ""
 
89
  if image is not None:
90
  ocr_text = pytesseract.image_to_string(image, lang="spa+eng")
91
  combined_text += " " + ocr_text
 
 
92
  combined_text = combined_text.strip()
93
+
94
  if not combined_text:
95
  return {
96
  "text_used_for_classification": "(none)",
97
  "label": "No text provided",
98
  "confidence": 0.0,
99
+ "smishing_keywords_found": [],
100
+ "other_scam_keywords_found": [],
101
  "urls_found": []
102
  }
103
 
104
+ # Perform zero-shot classification
105
  result = classifier(
106
  sequences=combined_text,
107
  candidate_labels=CANDIDATE_LABELS,
 
109
  )
110
  original_probs = dict(zip(result["labels"], result["scores"]))
111
 
112
+ # Apply boosts
113
+ boosted_probs = boost_probabilities(original_probs, combined_text)
 
 
114
  final_label = max(boosted_probs, key=boosted_probs.get)
115
  final_confidence = round(boosted_probs[final_label], 3)
116
 
117
+ # For display: which keywords + URLs
118
  lower_text = combined_text.lower()
119
+ smishing_found = [kw for kw in SMISHING_KEYWORDS if kw in lower_text]
120
+ other_scam_found = [kw for kw in OTHER_SCAM_KEYWORDS if kw in lower_text]
121
  found_urls = re.findall(r"(https?://[^\s]+)", lower_text)
122
 
123
  return {
124
  "text_used_for_classification": combined_text,
125
+ "original_probabilities": {
126
+ k: round(v, 3) for k, v in original_probs.items()
127
+ },
128
+ "boosted_probabilities": {
129
+ k: round(v, 3) for k, v in boosted_probs.items()
130
+ },
131
  "label": final_label,
132
  "confidence": final_confidence,
133
+ "smishing_keywords_found": smishing_found,
134
+ "other_scam_keywords_found": other_scam_found,
135
  "urls_found": found_urls,
136
  }
137
 
 
149
  )
150
  ],
151
  outputs="json",
152
+ title="SMiShing & Scam Detector (Separate Keywords + URL → SMiShing)",
153
  description="""
154
  This tool classifies messages as SMiShing, Other Scam, or Legitimate using a zero-shot model
155
+ (joeddav/xlm-roberta-large-xnli).
156
+ - 'smishing_keywords.txt' boosts SMiShing specifically.
157
+ - 'other_scam_keywords.txt' boosts Other Scam specifically.
158
+ - Any URL found further boosts ONLY Smishing.
159
+ - The total boost is subtracted from Legitimate.
160
  Supports English & Spanish text (OCR included).
161
  """,
162
  allow_flagging="never"
other_scam_keywords.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ love
2
+ urgent
3
+ help
4
+ lottery
5
+ winnings
6
+ prize
7
+ congratulations
8
+ gift
9
+ claim
requirements.txt CHANGED
@@ -4,4 +4,5 @@ torch==2.0.1
4
  sentencepiece==0.1.99
5
  pytesseract==0.3.10
6
  Pillow==9.5.0
7
- numpy==1.23.5
 
 
4
  sentencepiece==0.1.99
5
  pytesseract==0.3.10
6
  Pillow==9.5.0
7
+ tesseract-ocr
8
+ numpy==1.23.5
smishing_keywords.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ urgent
2
+ atm
3
+ password
4
+ bank
5
+ account
6
+ verify
7
+ http
8
+ .com
9
+ https
10
+ URL
11
+ copy
12
+ paste