cakiki loubnabnl HF staff committed on
Commit
ddc2323
Β·
1 Parent(s): 67fe836

add InCoder and CodeGen tokenizers for comparison (#1)

Browse files

- add InCoder and CodeGen tokenizers for comparison (6352d45cef72c9eebddc9c78461f965b8c6310b0)


Co-authored-by: loubna ben allal <[email protected]>

Files changed (1) hide show
  1. app.py +17 -2
app.py CHANGED
@@ -4,11 +4,26 @@ from transformers import AutoTokenizer
4
 
5
  st.set_page_config(page_title="BigCode Tokenizer", page_icon='πŸ‘©β€πŸ’»', layout="wide")
6
 
7
- tokenizer = AutoTokenizer.from_pretrained('bigcode/tokenizer', subfolder="digit-custom_punctuation-bytelevel-bpe-py-js-java-50k")
8
- visualizer = EncodingVisualizer(tokenizer=tokenizer._tokenizer, default_to_notebook=False)
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  text = st.text_area(label="", placeholder="Text to tokenize")
11
  button_clicked = st.button("Tokenize")
 
 
 
 
12
  if text or button_clicked:
13
  st.write(f"The input was split into {len(tokenizer.tokenize(text))} tokens.")
14
  st.components.v1.html(visualizer(text), height=1500)
 
4
 
5
# Page chrome for the Space.
st.set_page_config(
    page_title="BigCode Tokenizer",
    page_icon="👩‍💻",
    layout="wide",
)

# Tokenizers offered for side-by-side comparison in the selectbox below.
models = [
    "bigcode/tokenizer",
    "facebook/incoder-6B",
    "Salesforce/codegen-16B-mono",
]
8
+
9
@st.cache(allow_output_mutation=True)
def _fetch_tokenizer(model_name):
    """Download and cache the tokenizer for *model_name*.

    The cache is keyed on ``model_name``. The previous version cached a
    zero-argument function that read the global ``selected_model``; since
    ``st.cache`` keys only on arguments, the first tokenizer loaded was
    returned forever, regardless of the selectbox choice.
    ``allow_output_mutation=True`` skips Streamlit's output-hash mutation
    check, which tokenizer objects do not support.
    """
    if model_name == "bigcode/tokenizer":
        # The BigCode repo keeps this tokenizer in a subfolder.
        return AutoTokenizer.from_pretrained(
            "bigcode/tokenizer",
            subfolder="digit-custom_punctuation-bytelevel-bpe-py-js-java-50k",
        )
    return AutoTokenizer.from_pretrained(model_name)


def load_tokenizer():
    """Return the tokenizer for the currently selected model (cached)."""
    return _fetch_tokenizer(selected_model)
16
+
17
# Narrow left column hosts the model picker; the wide right column is unused.
col1, col2 = st.columns([1, 2])
with col1:
    selected_model = st.selectbox("Select a tokenizer", models, key=1)

# Text to tokenize, plus an explicit trigger button.
text = st.text_area(label="", placeholder="Text to tokenize")
button_clicked = st.button("Tokenize")
23
+
24
# Resolve the tokenizer for the chosen model and wrap it in the
# tokenizers-library visualizer (needs the underlying fast tokenizer).
tokenizer = load_tokenizer()
visualizer = EncodingVisualizer(
    tokenizer=tokenizer._tokenizer,
    default_to_notebook=False,
)

# Render either when text is present or the button was pressed.
if text or button_clicked:
    token_count = len(tokenizer.tokenize(text))
    st.write(f"The input was split into {token_count} tokens.")
    st.components.v1.html(visualizer(text), height=1500)