Commit
·
bdba8b7
1
Parent(s):
dafba98
Update README.md
Browse files
README.md
CHANGED
@@ -2,6 +2,18 @@
|
|
2 |
|
3 |
This is a roBERTa-base model trained on ~58M tweets, described and evaluated in the [_TweetEval_ benchmark (Findings of EMNLP 2020)](https://arxiv.org/pdf/2010.12421.pdf). To evaluate this and other LMs on Twitter-specific data, please refer to the [Tweeteval official repository](https://github.com/cardiffnlp/tweeteval).
|
4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
## Example Masked Language Model
|
6 |
|
7 |
```python
|
@@ -23,8 +35,9 @@ texts = [
|
|
23 |
"I am so <mask> 😢"
|
24 |
]
|
25 |
for text in texts:
|
26 |
-
|
27 |
-
|
|
|
28 |
print_candidates()
|
29 |
```
|
30 |
|
@@ -55,7 +68,7 @@ import numpy as np
|
|
55 |
|
56 |
MODEL = "cardiffnlp/twitter-roberta-base"
|
57 |
text = "Good night 😊"
|
58 |
-
|
59 |
tokenizer = AutoTokenizer.from_pretrained(MODEL)
|
60 |
|
61 |
# Pytorch
|
|
|
2 |
|
3 |
This is a roBERTa-base model trained on ~58M tweets, described and evaluated in the [_TweetEval_ benchmark (Findings of EMNLP 2020)](https://arxiv.org/pdf/2010.12421.pdf). To evaluate this and other LMs on Twitter-specific data, please refer to the [Tweeteval official repository](https://github.com/cardiffnlp/tweeteval).
|
4 |
|
5 |
+
## Preprocess Text
|
6 |
+
Replace usernames and links for placeholders: "@user" and "http".
|
7 |
+
```python
|
8 |
+
def preprocess(text):
|
9 |
+
new_text = []
|
10 |
+
for t in text.split(" "):
|
11 |
+
t = '@user' if t.startswith('@') and len(t) > 1 else t
|
12 |
+
t = 'http' if t.startswith('http') else t
|
13 |
+
new_text.append(t)
|
14 |
+
return " ".join(new_text)
|
15 |
+
```
|
16 |
+
|
17 |
## Example Masked Language Model
|
18 |
|
19 |
```python
|
|
|
35 |
"I am so <mask> 😢"
|
36 |
]
|
37 |
for text in texts:
|
38 |
+
t = preprocess(text)
|
39 |
+
print(f"{'-'*30}\n{t}")
|
40 |
+
candidates = fill_mask(t)
|
41 |
print_candidates()
|
42 |
```
|
43 |
|
|
|
68 |
|
69 |
MODEL = "cardiffnlp/twitter-roberta-base"
|
70 |
text = "Good night 😊"
|
71 |
+
text = preprocess(text)
|
72 |
tokenizer = AutoTokenizer.from_pretrained(MODEL)
|
73 |
|
74 |
# Pytorch
|