wissamantoun committed on
Commit 18d0924
1 Parent(s): c4b0d7f

Update README.md and config.json, and add aragpt2-large model to ARAGPT2_PRETRAINED_MODEL_ARCHIVE_LIST

Files changed (3)
  1. README.md +16 -13
  2. config.json +2 -1
  3. modeling_aragpt2.py +1 -0
README.md CHANGED
@@ -6,9 +6,9 @@ license_link: https://github.com/aub-mind/arabert/blob/master/aragpt2/LICENSE
 datasets:
 - wikipedia
 - Osian
-- 1.5B-Arabic-Corpus
-- oscar-arabic-unshuffled
-- Assafir(private)
+- arabic-billion-words
+- oscar
+- Assafir-private
 inference: false
 widget:
 - text: "يحكى أن مزارعا مخادعا قام ببيع بئر الماء الموجود في أرضه لجاره مقابل مبلغ كبير من المال"
@@ -18,6 +18,8 @@ widget:

 # Arabic GPT2

+
+
 <img src="https://raw.githubusercontent.com/aub-mind/arabert/master/AraGPT2.png" width="100" align="left"/>

 You can find more information in our paper [AraGPT2](https://arxiv.org/abs/2012.15520)
@@ -32,18 +34,17 @@ Both models are trained using the `adafactor` optimizer, since the `adam` and `l

 AraGPT2 is trained on the same large Arabic Dataset as AraBERTv2.

-# Usage
+
+# NOTE: The model expects the input to be preprocessed using the `arabert` library.
+Otherwise, the model will not be able to generate the correct output.

 ## Testing the model using `transformers`:

-You need to use the GPT2LMHeadModel from `arabert`: `pip install arabert`
+The model code is now hosted on HuggingFace, so you need to use the `trust_remote_code` flag. The model can be used as follows:
+

 ```python
-from transformers import GPT2TokenizerFast, pipeline
-#for base and medium
-from transformers import GPT2LMHeadModel
-#for large and mega
-from arabert.aragpt2.grover.modeling_gpt2 import GPT2LMHeadModel
+from transformers import AutoModelForCausalLM, pipeline

 from arabert.preprocess import ArabertPreprocessor

@@ -53,13 +54,15 @@ arabert_prep = ArabertPreprocessor(model_name=MODEL_NAME)
 text=""
 text_clean = arabert_prep.preprocess(text)

-model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
+model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code=True)
 tokenizer = GPT2TokenizerFast.from_pretrained(MODEL_NAME)
-generation_pipeline = pipeline("text-generation",model=model,tokenizer=tokenizer)
+generation_pipeline = pipeline(
+    "text-generation", model=MODEL_NAME, trust_remote_code=True
+)

 #feel free to try different decoding settings
 generation_pipeline(text,
-                    pad_token_id=tokenizer.eos_token_id,
+                    pad_token_id=generation_pipeline.tokenizer.eos_token_id,
                     num_beams=10,
                     max_length=200,
                     top_p=0.9,
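For context, here is a self-contained sketch of the usage the updated README describes. It assumes the `aubmindlab/aragpt2-large` checkpoint and an installed `arabert` package, and folds in the decoding settings shown in the diff; treat it as illustrative rather than the canonical snippet.

```python
# Minimal sketch of the updated README usage.
# Assumptions: the aubmindlab/aragpt2-large checkpoint and `pip install arabert`.
from transformers import pipeline
from arabert.preprocess import ArabertPreprocessor

MODEL_NAME = "aubmindlab/aragpt2-large"  # assumed checkpoint; other AraGPT2 repos should work the same way

# Preprocess the prompt with arabert, as the README note requires.
arabert_prep = ArabertPreprocessor(model_name=MODEL_NAME)
text = "يحكى أن مزارعا مخادعا قام ببيع بئر الماء الموجود في أرضه لجاره مقابل مبلغ كبير من المال"
text_clean = arabert_prep.preprocess(text)

# trust_remote_code is needed because the model code now lives in the Hub repo.
generation_pipeline = pipeline(
    "text-generation", model=MODEL_NAME, trust_remote_code=True
)

# Decoding settings taken from the README snippet; feel free to adjust them.
output = generation_pipeline(
    text_clean,
    pad_token_id=generation_pipeline.tokenizer.eos_token_id,
    num_beams=10,
    max_length=200,
    top_p=0.9,
)
print(output[0]["generated_text"])
```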
config.json CHANGED
@@ -41,5 +41,6 @@
       "no_repeat_ngram_size": 3
     }
   },
-  "vocab_size": 64000
+  "vocab_size": 64000,
+  "tokenizer_class": "GPT2Tokenizer"
 }
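The added `tokenizer_class` entry lets `AutoTokenizer` resolve the tokenizer type directly from `config.json`. A minimal sketch, assuming the config belongs to the `aubmindlab/aragpt2-large` repo and that no separate tokenizer configuration overrides it:

```python
# Sketch: with "tokenizer_class": "GPT2Tokenizer" in config.json,
# AutoTokenizer can infer the tokenizer type from the model config alone.
# Assumption: the config shown above belongs to aubmindlab/aragpt2-large.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("aubmindlab/aragpt2-large")
print(type(tokenizer).__name__)  # expected: GPT2TokenizerFast (fast variant of GPT2Tokenizer)
```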
modeling_aragpt2.py CHANGED
@@ -46,6 +46,7 @@ _CONFIG_FOR_DOC = "AraGPT2Config"
 _TOKENIZER_FOR_DOC = "GPT2Tokenizer"

 ARAGPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "aubmindlab/aragpt2-large",
     "aubmindlab/aragpt2-mega",
     # See all AraGPT2 models at https://huggingface.co/models?filter=aragpt2
 ]
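After this commit the archive list covers both checkpoints. Below is a small sketch (not from the repo) that iterates over the list and smoke-tests loading each model via the same `trust_remote_code` path the README uses; note that the mega checkpoint is memory-heavy, so this is illustrative only.

```python
# Sketch: smoke-test loading each checkpoint in the updated archive list.
# Assumption: both repos require trust_remote_code, as the README change indicates.
from transformers import AutoModelForCausalLM

ARAGPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "aubmindlab/aragpt2-large",
    "aubmindlab/aragpt2-mega",
]

for checkpoint in ARAGPT2_PRETRAINED_MODEL_ARCHIVE_LIST:
    model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True)
    n_params = sum(p.numel() for p in model.parameters())
    print(f"{checkpoint}: {n_params / 1e6:.0f}M parameters")
```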