duoquote
committed on
Commit
·
d1aed23
1
Parent(s):
e50ef50
Update model files and tokenizer configuration
Browse files
- README.md +36 -36
- config.json +66 -36
- model/config.json +66 -36
- model/model.safetensors +2 -2
- model/tokenizer.json +16 -2
- model/training_args.bin +1 -1
- predict.py +4 -9
- train.py +19 -16
README.md
CHANGED
@@ -30,55 +30,55 @@ The model is based on [dbmdz/bert-base-turkish-cased](https://huggingface.co/dbm
|
|
30 |
```
|
31 |
(g:\projects\address-extraction\venv) G:\projects\address-extraction>python predict.py
|
32 |
Osmangazi Mahallesi, Hoca Ahmet Yesevi Cd. No:34, 16050 Osmangazi/Bursa
|
33 |
-
Osmangazi Mahalle 98.
|
34 |
-
Hoca Ahmet Yesevi Cadde
|
35 |
-
34 Bina Numarası
|
36 |
-
16050 Posta Kodu
|
37 |
-
Osmangazi İlçe 98.
|
38 |
Bursa İl 99.21%
|
39 |
-
Average Score: 0.
|
40 |
Labels Found: 6
|
41 |
----------------------------------------------------------------------
|
42 |
Karşıyaka Mahallesi, Mavişehir Caddesi No: 91, Daire 4, 35540 Karşıyaka/İzmir
|
43 |
-
Karşıyaka Mahalle
|
44 |
-
Mavişehir Cadde
|
45 |
-
91 Bina Numarası
|
46 |
-
4
|
47 |
-
35540 Posta Kodu 98.
|
48 |
-
Karşıyaka İlçe
|
49 |
-
İzmir İl
|
50 |
-
Average Score: 0.
|
51 |
Labels Found: 7
|
52 |
----------------------------------------------------------------------
|
53 |
Selçuklu Mahallesi, Atatürk Bulvarı No: 55, 42050 Selçuklu/Konya
|
54 |
-
Selçuklu Mahalle 98.
|
55 |
-
Atatürk Cadde
|
56 |
-
55 Bina Numarası
|
57 |
-
42050 Posta Kodu 98.
|
58 |
-
Selçuklu İlçe
|
59 |
-
Konya İl 99.
|
60 |
-
Average Score: 0.
|
61 |
Labels Found: 6
|
62 |
----------------------------------------------------------------------
|
63 |
Alsancak Mahallesi, 1475. Sk. No:3, 35220 Konak/İzmir
|
64 |
-
Alsancak Mahalle 99.
|
65 |
-
1475 Sokak
|
66 |
-
3 Bina Numarası
|
67 |
-
35220 Posta Kodu
|
68 |
-
Konak İlçe
|
69 |
-
İzmir İl
|
70 |
-
Average Score: 0.
|
71 |
Labels Found: 6
|
72 |
----------------------------------------------------------------------
|
73 |
Kocatepe Mahallesi, Yaşam Caddesi 3. Sokak No:4, 06420 Bayrampaşa/İstanbul
|
74 |
-
Kocatepe Mahalle 99.
|
75 |
-
Yaşam Cadde
|
76 |
-
3 Sokak
|
77 |
-
4 Bina Numarası
|
78 |
-
06420 Posta Kodu
|
79 |
-
Bayrampaşa İlçe 98.
|
80 |
-
İstanbul İl 98.
|
81 |
-
Average Score: 0.
|
82 |
Labels Found: 7
|
83 |
----------------------------------------------------------------------
|
84 |
```
|
|
|
30 |
```
|
31 |
(g:\projects\address-extraction\venv) G:\projects\address-extraction>python predict.py
|
32 |
Osmangazi Mahallesi, Hoca Ahmet Yesevi Cd. No:34, 16050 Osmangazi/Bursa
|
33 |
+
Osmangazi Mahalle 98.80%
|
34 |
+
Hoca Ahmet Yesevi Cadde 98.55%
|
35 |
+
34 Bina Numarası 99.50%
|
36 |
+
16050 Posta Kodu 98.49%
|
37 |
+
Osmangazi İlçe 98.71%
|
38 |
Bursa İl 99.21%
|
39 |
+
Average Score: 0.9874102413654328
|
40 |
Labels Found: 6
|
41 |
----------------------------------------------------------------------
|
42 |
Karşıyaka Mahallesi, Mavişehir Caddesi No: 91, Daire 4, 35540 Karşıyaka/İzmir
|
43 |
+
Karşıyaka Mahalle 98.93%
|
44 |
+
Mavişehir Cadde 96.90%
|
45 |
+
91 Bina Numarası 99.25%
|
46 |
+
4 Bina Numarası 30.75%
|
47 |
+
35540 Posta Kodu 98.97%
|
48 |
+
Karşıyaka İlçe 98.84%
|
49 |
+
İzmir İl 98.86%
|
50 |
+
Average Score: 0.9173339426517486
|
51 |
Labels Found: 7
|
52 |
----------------------------------------------------------------------
|
53 |
Selçuklu Mahallesi, Atatürk Bulvarı No: 55, 42050 Selçuklu/Konya
|
54 |
+
Selçuklu Mahalle 98.53%
|
55 |
+
Atatürk Cadde 47.01%
|
56 |
+
55 Bina Numarası 99.49%
|
57 |
+
42050 Posta Kodu 98.78%
|
58 |
+
Selçuklu İlçe 98.74%
|
59 |
+
Konya İl 99.16%
|
60 |
+
Average Score: 0.9240859523415565
|
61 |
Labels Found: 6
|
62 |
----------------------------------------------------------------------
|
63 |
Alsancak Mahallesi, 1475. Sk. No:3, 35220 Konak/İzmir
|
64 |
+
Alsancak Mahalle 99.35%
|
65 |
+
1475 Sokak 97.71%
|
66 |
+
3 Bina Numarası 99.18%
|
67 |
+
35220 Posta Kodu 99.00%
|
68 |
+
Konak İlçe 98.90%
|
69 |
+
İzmir İl 98.95%
|
70 |
+
Average Score: 0.9881603717803955
|
71 |
Labels Found: 6
|
72 |
----------------------------------------------------------------------
|
73 |
Kocatepe Mahallesi, Yaşam Caddesi 3. Sokak No:4, 06420 Bayrampaşa/İstanbul
|
74 |
+
Kocatepe Mahalle 99.44%
|
75 |
+
Yaşam Cadde 92.45%
|
76 |
+
3 Sokak 70.61%
|
77 |
+
4 Bina Numarası 99.18%
|
78 |
+
06420 Posta Kodu 99.00%
|
79 |
+
Bayrampaşa İlçe 98.86%
|
80 |
+
İstanbul İl 98.90%
|
81 |
+
Average Score: 0.9558616995811462
|
82 |
Labels Found: 7
|
83 |
----------------------------------------------------------------------
|
84 |
```
|
config.json
CHANGED
@@ -9,46 +9,76 @@
|
|
9 |
"hidden_dropout_prob": 0.1,
|
10 |
"hidden_size": 768,
|
11 |
"id2label": {
|
12 |
-
"0": "
|
13 |
-
"1": "
|
14 |
-
"2": "
|
15 |
-
"3": "
|
16 |
-
"4": "
|
17 |
-
"5": "
|
18 |
-
"6": "
|
19 |
-
"7": "
|
20 |
-
"8": "
|
21 |
-
"9": "
|
22 |
-
"10": "
|
23 |
-
"11": "
|
24 |
-
"12": "
|
25 |
-
"13": "
|
26 |
-
"14": "
|
27 |
-
"15": "
|
28 |
-
"16": "
|
29 |
-
"17": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
},
|
31 |
"initializer_range": 0.02,
|
32 |
"intermediate_size": 3072,
|
33 |
"label2id": {
|
34 |
-
"Adres Detay":
|
35 |
-
"Bina Ad\u0131":
|
36 |
-
"Bina Numaras\u0131":
|
37 |
-
"Blok No":
|
38 |
-
"Bulvar":
|
39 |
-
"Cadde":
|
40 |
-
"Daire No":
|
41 |
-
"Kat":
|
42 |
-
"Mahalle":
|
43 |
-
"Posta Kodu":
|
44 |
-
"Site":
|
45 |
-
"Sokak":
|
46 |
-
"Yer Ad\u0131":
|
47 |
-
"
|
48 |
-
"
|
49 |
-
"\
|
50 |
-
"
|
51 |
-
"\
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
},
|
53 |
"layer_norm_eps": 1e-12,
|
54 |
"max_position_embeddings": 512,
|
|
|
9 |
"hidden_dropout_prob": 0.1,
|
10 |
"hidden_size": 768,
|
11 |
"id2label": {
|
12 |
+
"0": "O",
|
13 |
+
"1": "B-\u00dclke",
|
14 |
+
"2": "I-\u00dclke",
|
15 |
+
"3": "B-\u0130l",
|
16 |
+
"4": "I-\u0130l",
|
17 |
+
"5": "B-\u0130l\u00e7e",
|
18 |
+
"6": "I-\u0130l\u00e7e",
|
19 |
+
"7": "B-Mahalle",
|
20 |
+
"8": "I-Mahalle",
|
21 |
+
"9": "B-Cadde",
|
22 |
+
"10": "I-Cadde",
|
23 |
+
"11": "B-Sokak",
|
24 |
+
"12": "I-Sokak",
|
25 |
+
"13": "B-Bina Ad\u0131",
|
26 |
+
"14": "I-Bina Ad\u0131",
|
27 |
+
"15": "B-Bina Numaras\u0131",
|
28 |
+
"16": "I-Bina Numaras\u0131",
|
29 |
+
"17": "B-Yer Ad\u0131",
|
30 |
+
"18": "I-Yer Ad\u0131",
|
31 |
+
"19": "B-Site",
|
32 |
+
"20": "I-Site",
|
33 |
+
"21": "B-Adres Detay",
|
34 |
+
"22": "I-Adres Detay",
|
35 |
+
"23": "B-Blok No",
|
36 |
+
"24": "I-Blok No",
|
37 |
+
"25": "B-Bulvar",
|
38 |
+
"26": "I-Bulvar",
|
39 |
+
"27": "B-Daire No",
|
40 |
+
"28": "I-Daire No",
|
41 |
+
"29": "B-Posta Kodu",
|
42 |
+
"30": "I-Posta Kodu",
|
43 |
+
"31": "B-Kat",
|
44 |
+
"32": "I-Kat"
|
45 |
},
|
46 |
"initializer_range": 0.02,
|
47 |
"intermediate_size": 3072,
|
48 |
"label2id": {
|
49 |
+
"B-Adres Detay": 21,
|
50 |
+
"B-Bina Ad\u0131": 13,
|
51 |
+
"B-Bina Numaras\u0131": 15,
|
52 |
+
"B-Blok No": 23,
|
53 |
+
"B-Bulvar": 25,
|
54 |
+
"B-Cadde": 9,
|
55 |
+
"B-Daire No": 27,
|
56 |
+
"B-Kat": 31,
|
57 |
+
"B-Mahalle": 7,
|
58 |
+
"B-Posta Kodu": 29,
|
59 |
+
"B-Site": 19,
|
60 |
+
"B-Sokak": 11,
|
61 |
+
"B-Yer Ad\u0131": 17,
|
62 |
+
"B-\u00dclke": 1,
|
63 |
+
"B-\u0130l": 3,
|
64 |
+
"B-\u0130l\u00e7e": 5,
|
65 |
+
"I-Adres Detay": 22,
|
66 |
+
"I-Bina Ad\u0131": 14,
|
67 |
+
"I-Bina Numaras\u0131": 16,
|
68 |
+
"I-Blok No": 24,
|
69 |
+
"I-Bulvar": 26,
|
70 |
+
"I-Cadde": 10,
|
71 |
+
"I-Daire No": 28,
|
72 |
+
"I-Kat": 32,
|
73 |
+
"I-Mahalle": 8,
|
74 |
+
"I-Posta Kodu": 30,
|
75 |
+
"I-Site": 20,
|
76 |
+
"I-Sokak": 12,
|
77 |
+
"I-Yer Ad\u0131": 18,
|
78 |
+
"I-\u00dclke": 2,
|
79 |
+
"I-\u0130l": 4,
|
80 |
+
"I-\u0130l\u00e7e": 6,
|
81 |
+
"O": 0
|
82 |
},
|
83 |
"layer_norm_eps": 1e-12,
|
84 |
"max_position_embeddings": 512,
|
model/config.json
CHANGED
@@ -9,46 +9,76 @@
|
|
9 |
"hidden_dropout_prob": 0.1,
|
10 |
"hidden_size": 768,
|
11 |
"id2label": {
|
12 |
-
"0": "
|
13 |
-
"1": "
|
14 |
-
"2": "
|
15 |
-
"3": "
|
16 |
-
"4": "
|
17 |
-
"5": "
|
18 |
-
"6": "
|
19 |
-
"7": "
|
20 |
-
"8": "
|
21 |
-
"9": "
|
22 |
-
"10": "
|
23 |
-
"11": "
|
24 |
-
"12": "
|
25 |
-
"13": "
|
26 |
-
"14": "
|
27 |
-
"15": "
|
28 |
-
"16": "
|
29 |
-
"17": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
},
|
31 |
"initializer_range": 0.02,
|
32 |
"intermediate_size": 3072,
|
33 |
"label2id": {
|
34 |
-
"Adres Detay":
|
35 |
-
"Bina Ad\u0131":
|
36 |
-
"Bina Numaras\u0131":
|
37 |
-
"Blok No":
|
38 |
-
"Bulvar":
|
39 |
-
"Cadde":
|
40 |
-
"Daire No":
|
41 |
-
"Kat":
|
42 |
-
"Mahalle":
|
43 |
-
"Posta Kodu":
|
44 |
-
"Site":
|
45 |
-
"Sokak":
|
46 |
-
"Yer Ad\u0131":
|
47 |
-
"
|
48 |
-
"
|
49 |
-
"\
|
50 |
-
"
|
51 |
-
"\
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
},
|
53 |
"layer_norm_eps": 1e-12,
|
54 |
"max_position_embeddings": 512,
|
|
|
9 |
"hidden_dropout_prob": 0.1,
|
10 |
"hidden_size": 768,
|
11 |
"id2label": {
|
12 |
+
"0": "O",
|
13 |
+
"1": "B-\u00dclke",
|
14 |
+
"2": "I-\u00dclke",
|
15 |
+
"3": "B-\u0130l",
|
16 |
+
"4": "I-\u0130l",
|
17 |
+
"5": "B-\u0130l\u00e7e",
|
18 |
+
"6": "I-\u0130l\u00e7e",
|
19 |
+
"7": "B-Mahalle",
|
20 |
+
"8": "I-Mahalle",
|
21 |
+
"9": "B-Cadde",
|
22 |
+
"10": "I-Cadde",
|
23 |
+
"11": "B-Sokak",
|
24 |
+
"12": "I-Sokak",
|
25 |
+
"13": "B-Bina Ad\u0131",
|
26 |
+
"14": "I-Bina Ad\u0131",
|
27 |
+
"15": "B-Bina Numaras\u0131",
|
28 |
+
"16": "I-Bina Numaras\u0131",
|
29 |
+
"17": "B-Yer Ad\u0131",
|
30 |
+
"18": "I-Yer Ad\u0131",
|
31 |
+
"19": "B-Site",
|
32 |
+
"20": "I-Site",
|
33 |
+
"21": "B-Adres Detay",
|
34 |
+
"22": "I-Adres Detay",
|
35 |
+
"23": "B-Blok No",
|
36 |
+
"24": "I-Blok No",
|
37 |
+
"25": "B-Bulvar",
|
38 |
+
"26": "I-Bulvar",
|
39 |
+
"27": "B-Daire No",
|
40 |
+
"28": "I-Daire No",
|
41 |
+
"29": "B-Posta Kodu",
|
42 |
+
"30": "I-Posta Kodu",
|
43 |
+
"31": "B-Kat",
|
44 |
+
"32": "I-Kat"
|
45 |
},
|
46 |
"initializer_range": 0.02,
|
47 |
"intermediate_size": 3072,
|
48 |
"label2id": {
|
49 |
+
"B-Adres Detay": 21,
|
50 |
+
"B-Bina Ad\u0131": 13,
|
51 |
+
"B-Bina Numaras\u0131": 15,
|
52 |
+
"B-Blok No": 23,
|
53 |
+
"B-Bulvar": 25,
|
54 |
+
"B-Cadde": 9,
|
55 |
+
"B-Daire No": 27,
|
56 |
+
"B-Kat": 31,
|
57 |
+
"B-Mahalle": 7,
|
58 |
+
"B-Posta Kodu": 29,
|
59 |
+
"B-Site": 19,
|
60 |
+
"B-Sokak": 11,
|
61 |
+
"B-Yer Ad\u0131": 17,
|
62 |
+
"B-\u00dclke": 1,
|
63 |
+
"B-\u0130l": 3,
|
64 |
+
"B-\u0130l\u00e7e": 5,
|
65 |
+
"I-Adres Detay": 22,
|
66 |
+
"I-Bina Ad\u0131": 14,
|
67 |
+
"I-Bina Numaras\u0131": 16,
|
68 |
+
"I-Blok No": 24,
|
69 |
+
"I-Bulvar": 26,
|
70 |
+
"I-Cadde": 10,
|
71 |
+
"I-Daire No": 28,
|
72 |
+
"I-Kat": 32,
|
73 |
+
"I-Mahalle": 8,
|
74 |
+
"I-Posta Kodu": 30,
|
75 |
+
"I-Site": 20,
|
76 |
+
"I-Sokak": 12,
|
77 |
+
"I-Yer Ad\u0131": 18,
|
78 |
+
"I-\u00dclke": 2,
|
79 |
+
"I-\u0130l": 4,
|
80 |
+
"I-\u0130l\u00e7e": 6,
|
81 |
+
"O": 0
|
82 |
},
|
83 |
"layer_norm_eps": 1e-12,
|
84 |
"max_position_embeddings": 512,
|
model/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2ff0f793d2c61260659c6a327c27dd0ea1d632bc0e5fc51da60d20d3caf3f7f3
|
3 |
+
size 440231868
|
model/tokenizer.json
CHANGED
@@ -1,7 +1,21 @@
|
|
1 |
{
|
2 |
"version": "1.0",
|
3 |
-
"truncation":
|
4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
"added_tokens": [
|
6 |
{
|
7 |
"id": 0,
|
|
|
1 |
{
|
2 |
"version": "1.0",
|
3 |
+
"truncation": {
|
4 |
+
"direction": "Right",
|
5 |
+
"max_length": 128,
|
6 |
+
"strategy": "LongestFirst",
|
7 |
+
"stride": 0
|
8 |
+
},
|
9 |
+
"padding": {
|
10 |
+
"strategy": {
|
11 |
+
"Fixed": 128
|
12 |
+
},
|
13 |
+
"direction": "Right",
|
14 |
+
"pad_to_multiple_of": null,
|
15 |
+
"pad_id": 0,
|
16 |
+
"pad_type_id": 0,
|
17 |
+
"pad_token": "[PAD]"
|
18 |
+
},
|
19 |
"added_tokens": [
|
20 |
{
|
21 |
"id": 0,
|
model/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4664
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:551b4a0b8523f76d65879932b7a7ba98935984c8de39d14af0fd2659e2aadadc
|
3 |
size 4664
|
predict.py
CHANGED
@@ -5,9 +5,6 @@ from transformers import BertTokenizerFast, AutoTokenizer
|
|
5 |
|
6 |
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")
|
7 |
|
8 |
-
with open("labels.json", "r") as f:
|
9 |
-
id_to_label = {int(k): v for k, v in orjson.loads(f.read()).items()}
|
10 |
-
|
11 |
nlp = pipeline(
|
12 |
"ner",
|
13 |
model="./model",
|
@@ -19,20 +16,18 @@ def get_entities(tokens):
|
|
19 |
entities = []
|
20 |
entity = None
|
21 |
for token in tokens:
|
22 |
-
|
23 |
-
label = id_to_label[label_id]
|
24 |
-
if label.startswith("B-"):
|
25 |
if entity:
|
26 |
entity["score"] /= entity["token_count"]
|
27 |
entities.append(entity)
|
28 |
entity = {
|
29 |
-
"label":
|
30 |
"ranges": [token["start"], token["end"]],
|
31 |
"score": token["score"],
|
32 |
"token_count": 1,
|
33 |
}
|
34 |
-
elif
|
35 |
-
if entity and entity["label"] ==
|
36 |
entity["ranges"][1] = token["end"]
|
37 |
entity["token_count"] += 1
|
38 |
entity["score"] += token["score"]
|
|
|
5 |
|
6 |
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")
|
7 |
|
|
|
|
|
|
|
8 |
nlp = pipeline(
|
9 |
"ner",
|
10 |
model="./model",
|
|
|
16 |
entities = []
|
17 |
entity = None
|
18 |
for token in tokens:
|
19 |
+
if token["entity"].startswith("B-"):
|
|
|
|
|
20 |
if entity:
|
21 |
entity["score"] /= entity["token_count"]
|
22 |
entities.append(entity)
|
23 |
entity = {
|
24 |
+
"label": token["entity"][2:],
|
25 |
"ranges": [token["start"], token["end"]],
|
26 |
"score": token["score"],
|
27 |
"token_count": 1,
|
28 |
}
|
29 |
+
elif token["entity"].startswith("I-"):
|
30 |
+
if entity and entity["label"] == token["entity"][2:]:
|
31 |
entity["ranges"][1] = token["end"]
|
32 |
entity["token_count"] += 1
|
33 |
entity["score"] += token["score"]
|
train.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import io
|
|
|
2 |
import requests
|
3 |
import json
|
4 |
import time
|
@@ -63,14 +64,11 @@ def load_data():
|
|
63 |
return labels, [orjson.loads(line) for line in data.split("\n") if line]
|
64 |
|
65 |
labels, data = load_data()
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
label_to_id = {label["text"]: i + 1 for i, label in enumerate(labels)}
|
72 |
-
label_to_id["[PAD]"] = 0
|
73 |
-
label_to_id["[UNK]"] = len(label_to_id)
|
74 |
id_to_label = {v: k for k, v in label_to_id.items()}
|
75 |
|
76 |
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")
|
@@ -97,18 +95,20 @@ def preprocess_data(item, tokenizer, label_to_id):
|
|
97 |
attention_mask = inputs["attention_mask"]
|
98 |
offset_mapping = inputs["offset_mapping"]
|
99 |
|
100 |
-
labels = ["
|
|
|
101 |
for token_idx, [off_start, off_end] in enumerate(offset_mapping[0]):
|
102 |
if off_start == off_end:
|
103 |
continue
|
104 |
|
105 |
for start, end, label in item['label']:
|
106 |
if start <= off_start and off_end <= end:
|
107 |
-
|
|
|
|
|
|
|
|
|
108 |
break
|
109 |
-
|
110 |
-
if labels[token_idx] == "[PAD]":
|
111 |
-
labels[token_idx] = "[UNK]"
|
112 |
|
113 |
# Convert labels to ids
|
114 |
labels = [label_to_id[label] for label in labels]
|
@@ -132,6 +132,7 @@ class AddressDataset(Dataset):
|
|
132 |
return {key: torch.tensor(val) for key, val in item.items()}
|
133 |
|
134 |
|
|
|
135 |
dataset = Dataset.from_generator(
|
136 |
lambda: (preprocess_data(item, tokenizer, label_to_id) for item in data),
|
137 |
)
|
@@ -166,8 +167,8 @@ def compute_metrics(pred, id_to_label):
|
|
166 |
labels = [[id_to_label[label_id] for label_id in label_ids] for label_ids in labels]
|
167 |
preds = [[id_to_label[pred] for pred in preds] for preds in preds]
|
168 |
|
169 |
-
labels = [
|
170 |
-
preds = [
|
171 |
|
172 |
mlb = MultiLabelBinarizer()
|
173 |
mlb.fit([id_to_label.values()])
|
@@ -194,4 +195,6 @@ trainer = Trainer(
|
|
194 |
trainer.train()
|
195 |
trainer.evaluate()
|
196 |
|
197 |
-
trainer.save_model("./model")
|
|
|
|
|
|
1 |
import io
|
2 |
+
import shutil
|
3 |
import requests
|
4 |
import json
|
5 |
import time
|
|
|
64 |
return labels, [orjson.loads(line) for line in data.split("\n") if line]
|
65 |
|
66 |
labels, data = load_data()
|
67 |
+
label_to_id = {}
|
68 |
+
for i, label in enumerate(labels):
|
69 |
+
label_to_id["B-" + label["text"]] = i * 2 + 1
|
70 |
+
label_to_id["I-" + label["text"]] = i * 2 + 2
|
71 |
+
label_to_id["O"] = 0
|
|
|
|
|
|
|
72 |
id_to_label = {v: k for k, v in label_to_id.items()}
|
73 |
|
74 |
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")
|
|
|
95 |
attention_mask = inputs["attention_mask"]
|
96 |
offset_mapping = inputs["offset_mapping"]
|
97 |
|
98 |
+
labels = ["O"] * 128
|
99 |
+
last_label = "O"
|
100 |
for token_idx, [off_start, off_end] in enumerate(offset_mapping[0]):
|
101 |
if off_start == off_end:
|
102 |
continue
|
103 |
|
104 |
for start, end, label in item['label']:
|
105 |
if start <= off_start and off_end <= end:
|
106 |
+
if last_label == label:
|
107 |
+
labels[token_idx] = "I-" + label
|
108 |
+
else:
|
109 |
+
labels[token_idx] = "B-" + label
|
110 |
+
last_label = label
|
111 |
break
|
|
|
|
|
|
|
112 |
|
113 |
# Convert labels to ids
|
114 |
labels = [label_to_id[label] for label in labels]
|
|
|
132 |
return {key: torch.tensor(val) for key, val in item.items()}
|
133 |
|
134 |
|
135 |
+
|
136 |
dataset = Dataset.from_generator(
|
137 |
lambda: (preprocess_data(item, tokenizer, label_to_id) for item in data),
|
138 |
)
|
|
|
167 |
labels = [[id_to_label[label_id] for label_id in label_ids] for label_ids in labels]
|
168 |
preds = [[id_to_label[pred] for pred in preds] for preds in preds]
|
169 |
|
170 |
+
labels = [label for label in labels if label != "O"]
|
171 |
+
preds = [pred for pred in preds if pred != "O"]
|
172 |
|
173 |
mlb = MultiLabelBinarizer()
|
174 |
mlb.fit([id_to_label.values()])
|
|
|
195 |
trainer.train()
|
196 |
trainer.evaluate()
|
197 |
|
198 |
+
trainer.save_model("./model")
|
199 |
+
|
200 |
+
shutil.copy("./model/config.json", "./config.json")
|