boda
commited on
Commit
·
0e55bc2
1
Parent(s):
11fe992
init
Browse files- README.md +70 -0
- config.json +182 -0
- helpers/__pycache__/helper.cpython-39.pyc +0 -0
- helpers/download_model.py +32 -0
- helpers/helper.py +180 -0
- main.py +22 -0
- special_tokens_map.json +1 -0
- tokenizer_config.json +1 -0
- vocab.txt +0 -0
README.md
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
language:
|
3 |
+
- ar
|
4 |
+
|
5 |
+
thumbnail: "url to a thumbnail used in social sharing"
|
6 |
+
tags:
|
7 |
+
- ner
|
8 |
+
- token-classification
|
9 |
+
- Arabic-NER
|
10 |
+
|
11 |
+
metrics:
|
12 |
+
- accuracy
|
13 |
+
- f1
|
14 |
+
- precision
|
15 |
+
- recall
|
16 |
+
|
17 |
+
widget:
|
18 |
+
- text: "النجم محمد صلاح لاعب المنتخب المصري يعيش في مصر بالتحديد من نجريج, الشرقية"
|
19 |
+
example_title: "Mohamed Salah"
|
20 |
+
- text: "انا ساكن في حدايق الزتون و بدرس في جامعه عين شمس"
|
21 |
+
example_title: "Egyptian Dialect"
|
22 |
+
- text: "يقع نهر الأمازون في قارة أمريكا الجنوبية"
|
23 |
+
example_title: "Standard Arabic"
|
24 |
+
|
25 |
+
datasets:
|
26 |
+
- Fine-grained-Arabic-Named-Entity-Corpora
|
27 |
+
---
|
28 |
+
|
29 |
+
# Arabic Named Entity Recognition
|
30 |
+
|
31 |
+
This project aims to enrich Arabic Named Entity Recognition (ANER). Arabic is a tough language to deal with and poses a lot of difficulties.
|
32 |
+
We managed to build a model based on AraBERT that supports 50 entities.
|
33 |
+
|
34 |
+
## Paper
|
35 |
+
|
36 |
+
Here's the paper that contains all the details for our model, our approach, and the training results
|
37 |
+
|
38 |
+
- [ANER Paper](https://drive.google.com/file/d/1jJn3iWqOeLzaNvO-6aKfgidzJlWOtvti/view?usp=sharing)
|
39 |
+
|
40 |
+
# Usage
|
41 |
+
|
42 |
+
The model is available on the Hugging Face model hub under the name [boda/ANER](https://huggingface.co/boda/ANER). Checkpoints are currently available only in PyTorch.
|
43 |
+
|
44 |
+
### Use in python:
|
45 |
+
|
46 |
+
```python
|
47 |
+
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
48 |
+
|
49 |
+
tokenizer = AutoTokenizer.from_pretrained("boda/ANER")
|
50 |
+
|
51 |
+
model = AutoModelForTokenClassification.from_pretrained("boda/ANER")
|
52 |
+
```
|
53 |
+
|
54 |
+
# Dataset
|
55 |
+
|
56 |
+
- [Fine-grained Arabic Named Entity Corpora](https://fsalotaibi.kau.edu.sa/Pages-Arabic-NE-Corpora.aspx)
|
57 |
+
|
58 |
+
# Acknowledgments
|
59 |
+
|
60 |
+
Thanks to [Arabert](https://github.com/aub-mind/arabert) for providing the Arabic BERT model, which we used as a base model for our work.
|
61 |
+
|
62 |
+
We would also like to thank [Prof. Fahd Saleh S Alotaibi](https://fsalotaibi.kau.edu.sa/Pages-Arabic-NE-Corpora.aspx) of the Faculty of Computing and Information Technology, King Abdulaziz University, for providing the dataset we used to train our model.
|
63 |
+
|
64 |
+
# Contacts
|
65 |
+
|
66 |
+
**Abdelrahman Atef**
|
67 |
+
|
68 |
+
- [LinkedIn](https://linkedin.com/in/boda-sadalla)
|
69 |
+
- [Github](https://github.com/BodaSadalla98)
|
70 |
+
- <[email protected]>
|
config.json
ADDED
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": ".",
|
3 |
+
"architectures": [
|
4 |
+
"BertForTokenClassification"
|
5 |
+
],
|
6 |
+
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"gradient_checkpointing": false,
|
8 |
+
"hidden_act": "gelu",
|
9 |
+
"hidden_dropout_prob": 0.1,
|
10 |
+
"hidden_size": 768,
|
11 |
+
"id2label": {
|
12 |
+
"0": "O",
|
13 |
+
"1": " \u0641\u0646\u0627\u0646 ",
|
14 |
+
"2": " \u0641\u0646\u0627\u0646 ",
|
15 |
+
"3": " \u0635\u0648\u062a ",
|
16 |
+
"4": " \u0635\u0648\u062a ",
|
17 |
+
"5": " \u062a\u0639\u0644\u064a\u0645\u064a ",
|
18 |
+
"6": " \u062a\u0639\u0644\u064a\u0645\u064a ",
|
19 |
+
"7": " \u0623\u0631\u0627\u0636\u064a \u0627\u0644\u0628\u0646\u0627\u0621 ",
|
20 |
+
"8": " \u0623\u0631\u0627\u0636\u064a \u0627\u0644\u0628\u0646\u0627\u0621 ",
|
21 |
+
"9": " \u0645\u0631\u0643\u0632 \u0633\u0643\u0646\u064a ",
|
22 |
+
"10": " \u0634\u0639\u0628(\u0623\u0645\u0629) ",
|
23 |
+
"11": " \u0648\u0644\u0627\u064a\u0629 \u0623\u0648 \u0645\u0642\u0627\u0637\u0639\u0629 ",
|
24 |
+
"12": " \u0648\u0644\u0627\u064a\u0629 \u0623\u0648 \u0645\u0642\u0627\u0637\u0639\u0629 ",
|
25 |
+
"13": " \u0645\u0633\u0637\u062d \u0645\u0627\u0626\u064a ",
|
26 |
+
"14": " \u0645\u0633\u0637\u062d \u0645\u0627\u0626\u064a ",
|
27 |
+
"15": " \u0623\u0631\u0636 \u0637\u0628\u064a\u0639\u064a\u0629 ",
|
28 |
+
"16": " \u0623\u0631\u0636 \u0637\u0628\u064a\u0639\u064a\u0629 ",
|
29 |
+
"17": " \u0633\u0648\u0641\u062a\u0648\u064a\u0631(\u0628\u0631\u0645\u062c\u064a\u0627\u062a) ",
|
30 |
+
"18": " \u0633\u0648\u0641\u062a\u0648\u064a\u0631(\u0628\u0631\u0645\u062c\u064a\u0627\u062a) ",
|
31 |
+
"19": " \u0639\u0627\u0644\u0645 ",
|
32 |
+
"20": " \u0643\u062a\u0627\u0628 ",
|
33 |
+
"21": " \u0643\u062a\u0627\u0628 ",
|
34 |
+
"22": " \u0639\u0627\u0644\u0645 ",
|
35 |
+
"23": " \u0645\u062c\u0645\u0648\u0639\u0629 ",
|
36 |
+
"24": " \u0633\u0645\u0627\u0648\u064a ",
|
37 |
+
"25": " \u0634\u0631\u0637\u0629 ",
|
38 |
+
"26": " \u0634\u0631\u0637\u0629 ",
|
39 |
+
"27": " \u0645\u0631\u0643\u0632 \u0633\u0643\u0646\u064a ",
|
40 |
+
"28": " \u0633\u0645\u0627\u0648\u064a ",
|
41 |
+
"29": " \u0645\u0647\u0646\u062f\u0633 ",
|
42 |
+
"30": " \u0645\u0647\u0646\u062f\u0633 ",
|
43 |
+
"31": " \u0642\u0630\u064a\u0641\u0629 ",
|
44 |
+
"32": " \u062d\u0643\u0648\u0645\u0629 ",
|
45 |
+
"33": " \u062d\u0643\u0648\u0645\u0629 ",
|
46 |
+
"34": " \u062a\u062c\u0627\u0631\u064a ",
|
47 |
+
"35": " \u062a\u062c\u0627\u0631\u064a ",
|
48 |
+
"36": " \u0642\u0627\u0631\u0629 ",
|
49 |
+
"37": " \u0647\u0648\u0627\u0621 ",
|
50 |
+
"38": " \u0647\u0648\u0627\u0621 ",
|
51 |
+
"39": " \u0634\u062e\u0635 ",
|
52 |
+
"40": " \u0634\u062e\u0635 ",
|
53 |
+
"41": " \u0645\u062c\u0645\u0648\u0639\u0629 ",
|
54 |
+
"42": " \u0633\u064a\u0627\u0633\u064a ",
|
55 |
+
"43": " \u0633\u064a\u0627\u0633\u064a ",
|
56 |
+
"44": " \u0631\u064a\u0627\u0636\u064a ",
|
57 |
+
"45": " \u0631\u064a\u0627\u0636\u064a ",
|
58 |
+
"46": " \u0645\u0624\u0633\u0633\u0629 \u062f\u064a\u0646\u064a\u0629 ",
|
59 |
+
"47": " \u0645\u0624\u0633\u0633\u0629 \u062f\u064a\u0646\u064a\u0629 ",
|
60 |
+
"48": " \u0637\u0631\u064a\u0642 ",
|
61 |
+
"49": " \u0637\u0631\u064a\u0642 ",
|
62 |
+
"50": " \u0625\u0639\u0644\u0627\u0645 ",
|
63 |
+
"51": " \u0625\u0639\u0644\u0627\u0645 ",
|
64 |
+
"52": " \u063a\u064a\u0631 \u062d\u0643\u0648\u0645\u064a ",
|
65 |
+
"53": " \u063a\u064a\u0631 \u062d\u0643\u0648\u0645\u064a ",
|
66 |
+
"54": " \u0645\u062f\u064a\u0646\u0629 \u0623\u0648 \u0636\u0627\u062d\u064a\u0629 ",
|
67 |
+
"55": " \u0645\u062f\u064a\u0646\u0629 \u0623\u0648 \u0636\u0627\u062d\u064a\u0629 ",
|
68 |
+
"56": " \u0631\u062c\u0644 \u0623\u0639\u0645\u0627\u0644 ",
|
69 |
+
"57": " \u0645\u062d\u0627\u0645\u064a ",
|
70 |
+
"58": " \u0645\u062d\u0627\u0645\u064a ",
|
71 |
+
"59": " ",
|
72 |
+
"60": " ",
|
73 |
+
"61": " \u0634\u0639\u0628(\u0623\u0645\u0629) ",
|
74 |
+
"62": " \u0634\u062e\u0635 \u062f\u064a\u0646\u064a ",
|
75 |
+
"63": " \u0634\u062e\u0635 \u062f\u064a\u0646\u064a ",
|
76 |
+
"64": " \u0631\u062c\u0644 \u0623\u0639\u0645\u0627\u0644 ",
|
77 |
+
"65": " \u0639\u0644\u0648\u0645 \u0637\u0628\u064a\u0629 ",
|
78 |
+
"66": " \u0639\u0644\u0648\u0645 \u0637\u0628\u064a\u0629 ",
|
79 |
+
"67": " \u0641\u064a\u0644\u0645 ",
|
80 |
+
"68": " \u0641\u064a\u0644\u0645 ",
|
81 |
+
"69": " \u0645\u0627\u0621 ",
|
82 |
+
"70": " \u0645\u0627\u0621 ",
|
83 |
+
"71": " \u062f\u0648\u0627\u0621 ",
|
84 |
+
"72": " \u0639\u062a\u0627\u062f ",
|
85 |
+
"73": " \u0639\u062a\u0627\u062f ",
|
86 |
+
"74": " \u0645\u0646\u0634\u0623\u0629 \u0645\u0646\u0637\u0642\u0629 \u0641\u0631\u0639\u064a\u0629 ",
|
87 |
+
"75": " \u0645\u0646\u0634\u0623\u0629 \u0645\u0646\u0637\u0642\u0629 \u0641\u0631\u0639\u064a\u0629 ",
|
88 |
+
"76": " \u0641\u0638 ",
|
89 |
+
"77": " \u0645\u0637\u0627\u0631 ",
|
90 |
+
"78": " \u0641\u0638 ",
|
91 |
+
"79": " \u062f\u0648\u0627\u0621 ",
|
92 |
+
"80": " \u0631\u064a\u0627\u0636\u0629 ",
|
93 |
+
"81": " \u0631\u064a\u0627\u0636\u0629 ",
|
94 |
+
"82": " \u0631\u0645\u0627\u064a\u0629 ",
|
95 |
+
"83": " \u0631\u0645\u0627\u064a\u0629 ",
|
96 |
+
"84": " \u0637\u0639\u0627\u0645 ",
|
97 |
+
"85": " \u0637\u0639\u0627\u0645 ",
|
98 |
+
"86": " \u0642\u0627\u0631\u0629 ",
|
99 |
+
"87": " \u0646\u0648\u0648\u064a ",
|
100 |
+
"88": " \u0646\u0648\u0648\u064a ",
|
101 |
+
"89": " \u062a\u0631\u0641\u064a\u0647 ",
|
102 |
+
"90": " \u062a\u0631\u0641\u064a\u0647 ",
|
103 |
+
"91": " \u0642\u0630\u064a\u0641\u0629 ",
|
104 |
+
"92": " \u0623\u0631\u0636 ",
|
105 |
+
"93": " \u062d\u0627\u062f ",
|
106 |
+
"94": " \u0645\u0637\u0627\u0631 ",
|
107 |
+
"95": " \u0623\u0631\u0636 ",
|
108 |
+
"96": " \u0646\u0628\u0627\u062a ",
|
109 |
+
"97": " \u0646\u0628\u0627\u062a ",
|
110 |
+
"98": " \u0645\u0646\u0641\u062c\u0631 ",
|
111 |
+
"99": " \u0645\u0646\u0641\u062c\u0631 ",
|
112 |
+
"100": " \u0643\u064a\u0645\u064a\u0627\u0626\u064a ",
|
113 |
+
"101": " \u0643\u064a\u0645\u064a\u0627\u0626\u064a "
|
114 |
+
},
|
115 |
+
"initializer_range": 0.02,
|
116 |
+
"intermediate_size": 3072,
|
117 |
+
"label2id": {
|
118 |
+
" ": 60,
|
119 |
+
" \u0623\u0631\u0627\u0636\u064a \u0627\u0644\u0628\u0646\u0627\u0621 ": 8,
|
120 |
+
" \u0623\u0631\u0636 ": 95,
|
121 |
+
" \u0623\u0631\u0636 \u0637\u0628\u064a\u0639\u064a\u0629 ": 16,
|
122 |
+
" \u0625\u0639\u0644\u0627\u0645 ": 51,
|
123 |
+
" \u062a\u062c\u0627\u0631\u064a ": 35,
|
124 |
+
" \u062a\u0631\u0641\u064a\u0647 ": 90,
|
125 |
+
" \u062a\u0639\u0644\u064a\u0645\u064a ": 6,
|
126 |
+
" \u062d\u0627\u062f ": 93,
|
127 |
+
" \u062d\u0643\u0648\u0645\u0629 ": 33,
|
128 |
+
" \u062f\u0648\u0627\u0621 ": 79,
|
129 |
+
" \u0631\u062c\u0644 \u0623\u0639\u0645\u0627\u0644 ": 64,
|
130 |
+
" \u0631\u0645\u0627\u064a\u0629 ": 83,
|
131 |
+
" \u0631\u064a\u0627\u0636\u0629 ": 81,
|
132 |
+
" \u0631\u064a\u0627\u0636\u064a ": 45,
|
133 |
+
" \u0633\u0645\u0627\u0648\u064a ": 28,
|
134 |
+
" \u0633\u0648\u0641\u062a\u0648\u064a\u0631(\u0628\u0631\u0645\u062c\u064a\u0627\u062a) ": 18,
|
135 |
+
" \u0633\u064a\u0627\u0633\u064a ": 43,
|
136 |
+
" \u0634\u062e\u0635 ": 40,
|
137 |
+
" \u0634\u062e\u0635 \u062f\u064a\u0646\u064a ": 63,
|
138 |
+
" \u0634\u0631\u0637\u0629 ": 26,
|
139 |
+
" \u0634\u0639\u0628(\u0623\u0645\u0629) ": 61,
|
140 |
+
" \u0635\u0648\u062a ": 4,
|
141 |
+
" \u0637\u0631\u064a\u0642 ": 49,
|
142 |
+
" \u0637\u0639\u0627\u0645 ": 85,
|
143 |
+
" \u0639\u0627\u0644\u0645 ": 22,
|
144 |
+
" \u0639\u062a\u0627\u062f ": 73,
|
145 |
+
" \u0639\u0644\u0648\u0645 \u0637\u0628\u064a\u0629 ": 66,
|
146 |
+
" \u063a\u064a\u0631 \u062d\u0643\u0648\u0645\u064a ": 53,
|
147 |
+
" \u0641\u0638 ": 78,
|
148 |
+
" \u0641\u0646\u0627\u0646 ": 2,
|
149 |
+
" \u0641\u064a\u0644\u0645 ": 68,
|
150 |
+
" \u0642\u0627\u0631\u0629 ": 86,
|
151 |
+
" \u0642\u0630\u064a\u0641\u0629 ": 91,
|
152 |
+
" \u0643\u062a\u0627\u0628 ": 21,
|
153 |
+
" \u0643\u064a\u0645\u064a\u0627\u0626\u064a ": 101,
|
154 |
+
" \u0645\u0624\u0633\u0633\u0629 \u062f\u064a\u0646\u064a\u0629 ": 47,
|
155 |
+
" \u0645\u0627\u0621 ": 70,
|
156 |
+
" \u0645\u062c\u0645\u0648\u0639\u0629 ": 41,
|
157 |
+
" \u0645\u062d\u0627\u0645\u064a ": 58,
|
158 |
+
" \u0645\u062f\u064a\u0646\u0629 \u0623\u0648 \u0636\u0627\u062d\u064a\u0629 ": 55,
|
159 |
+
" \u0645\u0631\u0643\u0632 \u0633\u0643\u0646\u064a ": 27,
|
160 |
+
" \u0645\u0633\u0637\u062d \u0645\u0627\u0626\u064a ": 14,
|
161 |
+
" \u0645\u0637\u0627\u0631 ": 94,
|
162 |
+
" \u0645\u0646\u0634\u0623\u0629 \u0645\u0646\u0637\u0642\u0629 \u0641\u0631\u0639\u064a\u0629 ": 75,
|
163 |
+
" \u0645\u0646\u0641\u062c\u0631 ": 99,
|
164 |
+
" \u0645\u0647\u0646\u062f\u0633 ": 30,
|
165 |
+
" \u0646\u0628\u0627\u062a ": 97,
|
166 |
+
" \u0646\u0648\u0648\u064a ": 88,
|
167 |
+
" \u0647\u0648\u0627\u0621 ": 38,
|
168 |
+
" \u0648\u0644\u0627\u064a\u0629 \u0623\u0648 \u0645\u0642\u0627\u0637\u0639\u0629 ": 12,
|
169 |
+
"O": 0
|
170 |
+
},
|
171 |
+
"layer_norm_eps": 1e-12,
|
172 |
+
"max_position_embeddings": 512,
|
173 |
+
"model_type": "bert",
|
174 |
+
"num_attention_heads": 12,
|
175 |
+
"num_hidden_layers": 12,
|
176 |
+
"pad_token_id": 0,
|
177 |
+
"position_embedding_type": "absolute",
|
178 |
+
"transformers_version": "4.5.0",
|
179 |
+
"type_vocab_size": 2,
|
180 |
+
"use_cache": true,
|
181 |
+
"vocab_size": 64000
|
182 |
+
}
|
helpers/__pycache__/helper.cpython-39.pyc
ADDED
Binary file (3.76 kB). View file
|
|
helpers/download_model.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
|
3 |
+
def download_file_from_google_drive(id, destination):
    """Download a shared Google Drive file and write it to *destination*.

    A first request is sent with the file id. If the response carries a
    ``download_warning`` cookie (see ``get_confirm_token``), the request is
    repeated with that cookie's value as the ``confirm`` parameter, and the
    second response is the one that gets saved.

    Args:
        id: Google Drive file id. The name shadows the builtin but is kept so
            existing keyword callers keep working.
        destination: Path of the file the payload is written to.

    Raises:
        requests.HTTPError: If the final response has an error status, so an
            error page is never saved as if it were the requested file.
    """
    URL = "https://docs.google.com/uc?export=download"

    # One session for both requests so cookies set by the first are sent with
    # the second; the context manager closes it even on failure.
    with requests.Session() as session:
        response = session.get(URL, params={'id': id}, stream=True)
        token = get_confirm_token(response)

        if token:
            # The first body is never read; release its connection before
            # issuing the confirmed request.
            response.close()
            params = {'id': id, 'confirm': token}
            response = session.get(URL, params=params, stream=True)

        try:
            response.raise_for_status()
            save_response_content(response, destination)
        finally:
            response.close()
|
16 |
+
|
17 |
+
def get_confirm_token(response):
    """Return the value of the first ``download_warning*`` cookie, if any.

    Args:
        response: Object exposing ``cookies.items()`` yielding
            ``(name, value)`` pairs.

    Returns:
        The value of the first cookie whose name starts with
        ``download_warning``, or ``None`` when no such cookie exists.
    """
    matching_values = (
        cookie_value
        for cookie_name, cookie_value in response.cookies.items()
        if cookie_name.startswith('download_warning')
    )
    return next(matching_values, None)
|
23 |
+
|
24 |
+
def save_response_content(response, destination):
    """Stream the body of *response* into the file at *destination*.

    Args:
        response: Object whose ``iter_content(size)`` yields byte chunks.
        destination: Path of the file to create or overwrite (binary mode).
    """
    chunk_bytes = 32768

    with open(destination, "wb") as out_file:
        # Empty chunks (keep-alives) are dropped rather than written.
        out_file.writelines(filter(None, response.iter_content(chunk_bytes)))
|
31 |
+
|
32 |
+
|
helpers/helper.py
ADDED
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from numpy import string_
|
2 |
+
import re
|
3 |
+
|
4 |
+
# Arabic display names for the coarse LOC / ORG / PERS / MISC BIO tags.
# The B- and I- variants of a tag map to the same name.
en_to_ar_camel = {
    'B-LOC': 'مكان',
    'B-ORG': 'مؤسسة',
    'B-PERS': 'شخص',
    'B-MISC': 'معنى بموضوعات متنوعة',
    'I-LOC': 'مكان',
    'I-ORG': 'مؤسسة',
    # Was misspelled 'شحص'; now matches B-PERS.
    'I-PERS': 'شخص',
    'I-MISC': 'معنى بموضوعات متنوعة',
}
|
14 |
+
|
15 |
+
# Arabic display names for the fine-grained BIO entity tags.
# NOTE(review): the insertion order appears to mirror ids 1..101 of
# ``id2label`` in config.json, so keep it stable — confirm before reordering.
en_to_ar = {
    "B-Artist": "فنان",
    "I-Artist": "فنان",
    "B-Sound": "صوت",
    "I-Sound": "صوت",
    "B-Educational": "تعليمي",
    "I-Educational": "تعليمي",
    "B-Building-Grounds": "أراضي البناء",
    "I-Building-Grounds": "أراضي البناء",
    "B-Population-Center": "مركز سكني",
    "B-Nation": "شعب(أمة)",
    "B-State-or-Province": "ولاية أو مقاطعة",
    "I-State-or-Province": "ولاية أو مقاطعة",
    "B-Water-Body": "مسطح مائي",
    "I-Water-Body": "مسطح مائي",
    "B-Land-Region-Natural": "أرض طبيعية",
    "I-Land-Region-Natural": "أرض طبيعية",
    "B-Software": "سوفتوير(برمجيات)",
    "I-Software": "سوفتوير(برمجيات)",
    "B-Scientist": "عالم",
    "B-Book": "كتاب",
    "I-Book": "كتاب",
    "I-Scientist": "عالم",
    "B-Group": "مجموعة",
    "B-Celestial": "سماوي",
    "B-Police": "شرطة",
    "I-Police": "شرطة",
    "I-Population-Center": "مركز سكني",
    "I-Celestial": "سماوي",
    "B-Engineer": "مهندس",
    "I-Engineer": "مهندس",
    "B-Projectile": "قذيفة",
    "B-Government": "حكومة",
    "I-Government": "حكومة",
    "B-Commercial": "تجاري",
    "I-Commercial": "تجاري",
    "B-Continent": "قارة",
    "B-Air": "هواء",
    "I-Air": "هواء",
    "B-Other_PER": "شخص",
    "I-Other_PER": "شخص",
    "I-Group": "مجموعة",
    "B-Politician": "سياسي",
    "I-Politician": "سياسي",
    "B-Athlete": "رياضي",
    "I-Athlete": "رياضي",
    "B-Religious_ORG": "مؤسسة دينية",
    "I-Religious_ORG": "مؤسسة دينية",
    "B-Path": "طريق",
    "I-Path": "طريق",
    "B-Media": "إعلام",
    "I-Media": "إعلام",
    "B-Non-Governmental": "غير حكومي",
    "I-Non-Governmental": "غير حكومي",
    "B-County-or-District": "مدينة أو ضاحية",
    "I-County-or-District": "مدينة أو ضاحية",
    "B-Businessperson": "رجل أعمال",
    "B-Lawyer": "محامي",
    "I-Lawyer": "محامي",
    # GPE-Cluster has no Arabic name in the original mapping.
    "B-GPE-Cluster": "",
    "I-GPE-Cluster": "",
    "I-Nation": "شعب(أمة)",
    "B-Religious_PER": "شخص ديني",
    "I-Religious_PER": "شخص ديني",
    "I-Businessperson": "رجل أعمال",
    "B-Medical-Science": "علوم طبية",
    "I-Medical-Science": "علوم طبية",
    "B-Movie": "فيلم",
    "I-Movie": "فيلم",
    "B-Water": "ماء",
    "I-Water": "ماء",
    "B-Drug": "دواء",
    "B-Hardware": "عتاد",
    "I-Hardware": "عتاد",
    "B-Subarea-Facility": "منشأة منطقة فرعية",
    "I-Subarea-Facility": "منشأة منطقة فرعية",
    "B-Blunt": "فظ",
    "B-Airport": "مطار",
    "I-Blunt": "فظ",
    "I-Drug": "دواء",
    "B-Sports": "رياضة",
    "I-Sports": "رياضة",
    "B-Shooting": "رماية",
    "I-Shooting": "رماية",
    "B-Food": "طعام",
    "I-Food": "طعام",
    "I-Continent": "قارة",
    "B-Nuclear": "نووي",
    "I-Nuclear": "نووي",
    "B-Entertainment": "ترفيه",
    "I-Entertainment": "ترفيه",
    "I-Projectile": "قذيفة",
    "B-Land": "أرض",
    "B-Sharp": "حاد",
    "I-Airport": "مطار",
    "I-Land": "أرض",
    "B-Plant": "نبات",
    "I-Plant": "نبات",
    "B-Exploding": "منفجر",
    "I-Exploding": "منفجر",
    "B-Chemical": "كيميائي",
    "I-Chemical": "كيميائي",
}
|
118 |
+
|
119 |
+
|
120 |
+
|
121 |
+
|
122 |
+
def get_separate_entities(labels, tokens):
    """Group BIO-tagged tokens into whole entities.

    Consecutive tokens of one entity are joined with single spaces, e.g.
    ``("mohamed", "salah")`` becomes ``"mohamed salah"``; per the original
    author, the joined names are used to search Wikipedia.

    Args:
        labels: Sequence of tag strings. A tag containing ``"B-"`` starts a
            new entity, one containing ``"I-"`` continues the current entity,
            and anything else marks a non-entity token.
        tokens: Sequence of token strings aligned index-by-index with
            ``labels``.

    Returns:
        A list of ``(text, flag)`` tuples in input order, where ``flag`` is
        ``1`` for a (possibly multi-token) entity and ``0`` for a single
        non-entity token.

    Raises:
        IndexError: If ``tokens`` is shorter than ``labels``.
    """
    res = []
    current = []  # tokens of the entity currently being accumulated

    def flush():
        # Emit the pending entity, if any, and start afresh.
        if current:
            res.append((' '.join(current), 1))
            current.clear()

    for i, label in enumerate(labels):
        token = tokens[i]
        if "B-" in label:
            # A B- tag always opens a new entity, even if the pending one was
            # started by a stray I- tag (previously the two were merged).
            flush()
            current.append(token)
        elif "I-" in label:
            current.append(token)
        else:
            flush()
            res.append((token, 0))

    # An entity running up to the end of the input still has to be emitted.
    flush()
    return res
|
170 |
+
|
171 |
+
|
172 |
+
|
173 |
+
|
174 |
+
|
175 |
+
|
176 |
+
|
177 |
+
|
178 |
+
|
179 |
+
|
180 |
+
|
main.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoModelForTokenClassification, AutoTokenizer
|
2 |
+
from helpers import helper
|
3 |
+
|
4 |
+
|
5 |
+
# Name of the published model repository.
# NOTE(review): not referenced by the visible code below, which loads from the
# current directory instead — confirm whether this is intentional.
MODEL_NAME = 'boda/ANER'


# Load model and tokenizer from the files in the current working directory
model = AutoModelForTokenClassification.from_pretrained('.')
tokenizer = AutoTokenizer.from_pretrained('.')
|
11 |
+
|
12 |
+
# change in the model labels
|
13 |
+
# model.config.id2label = {i: ' '+v+' ' for i, v in model.config.id2label.items() if i != 0 }
|
14 |
+
# model.config.id2label[0] = 'O'
|
15 |
+
# model.config.label2id = {label: i for i, label in model.config.id2label.items()}
|
16 |
+
|
17 |
+
|
18 |
+
|
19 |
+
|
20 |
+
# save model after finish
|
21 |
+
# model.save_pretrained('.')
|
22 |
+
|
special_tokens_map.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
|
tokenizer_config.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "max_len": 512, "do_basic_tokenize": true, "never_split": ["[بريد]", "[مستخدم]", "[رابط]"], "special_tokens_map_file": null, "name_or_path": "aubmindlab/bert-base-arabertv02"}
|
vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|