loubnabnl HF staff committed on
Commit
0d5adbc
·
1 Parent(s): a111b54
Files changed (1) hide show
  1. app.py +88 -96
app.py CHANGED
@@ -10,7 +10,7 @@ GITHUB_CODE = "https://huggingface.co/datasets/lvwerra/github-code"
10
  INCODER_IMG = (
11
  "https://huggingface.co/datasets/loubnabnl/repo-images/raw/main/incoder.png"
12
  )
13
-
14
 
15
  @st.cache()
16
  def load_examples():
@@ -32,100 +32,92 @@ def generate_code(model_name, gen_prompt, max_new_tokens, temperature, seed):
32
 
33
  st.set_page_config(page_icon=":laptop:", layout="wide")
34
 
35
- st.sidebar.header("Models")
36
- models = ["CodeParrot", "InCoder"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  selected_models = st.sidebar.multiselect(
38
- "Select code generation models to compare", models, default=["CodeParrot"]
39
  )
40
-
41
- st.sidebar.header("Tasks")
42
- tasks = [
43
- " ",
44
- "Pretraining datasets",
45
- "Model architecture",
46
- "Model evaluation",
47
- "Code generation",
48
- ]
49
- selected_task = st.sidebar.selectbox("Select a task", tasks)
50
-
51
-
52
- if selected_task == " ":
53
- st.title("Code Generation Models")
54
- with open("utils/intro.txt", "r") as f:
55
- intro = f.read()
56
- st.markdown(intro)
57
-
58
- elif selected_task == "Pretraining datasets":
59
- st.title("Pretraining datasets 📚")
60
- st.markdown(
61
- f"Preview of some code files from Github repositories in [Github-code dataset]({GITHUB_CODE}):"
62
- )
63
- df = pd.read_csv("utils/data_preview.csv")
64
- st.dataframe(df)
65
- for model in selected_models:
66
- with open(f"datasets/{model.lower()}.txt", "r") as f:
67
- text = f.read()
68
- st.markdown(f"### {model}")
69
- st.markdown(text)
70
-
71
- elif selected_task == "Model architecture":
72
- st.title("Model architecture")
73
- for model in selected_models:
74
- with open(f"architectures/{model.lower()}.txt", "r") as f:
75
- text = f.read()
76
- st.markdown(f"## {model}")
77
- st.markdown(text)
78
- if model == "InCoder":
79
- st.image(INCODER_IMG, caption="Figure 1: InCoder training", width=700)
80
-
81
- elif selected_task == "Model evaluation":
82
- st.title("Code models evaluation 📊")
83
- with open("evaluation/intro.txt", "r") as f:
84
- intro = f.read()
85
- st.markdown(intro)
86
-
87
- elif selected_task == "Code generation":
88
- st.title("Code generation 💻")
89
- st.sidebar.header("Examples")
90
- examples = load_examples()
91
- example_names = [example["name"] for example in examples]
92
- name2id = dict([(name, i) for i, name in enumerate(example_names)])
93
- selected_example = st.sidebar.selectbox(
94
- "Select one of the following examples or implement yours", example_names
95
- )
96
- example_text = examples[name2id[selected_example]]["value"]
97
- default_length = examples[name2id[selected_example]]["length"]
98
- st.sidebar.header("Generation settings")
99
- temperature = st.sidebar.slider(
100
- "Temperature:", value=0.2, min_value=0.0, step=0.1, max_value=2.0
101
- )
102
- max_new_tokens = st.sidebar.slider(
103
- "Number of tokens to generate:",
104
- value=default_length,
105
- min_value=8,
106
- step=8,
107
- max_value=256,
108
- )
109
- seed = st.sidebar.slider(
110
- "Random seed:", value=42, min_value=0, step=1, max_value=1000
111
- )
112
- gen_prompt = st.text_area(
113
- "Generate code with prompt:",
114
- value=example_text,
115
- height=220,
116
- ).strip()
117
- if st.button("Generate code!"):
118
- with st.spinner("Generating code..."):
119
- # Create a multiprocessing Pool
120
- pool = Pool()
121
- generate_parallel = partial(
122
- generate_code,
123
- gen_prompt=gen_prompt,
124
- max_new_tokens=max_new_tokens,
125
- temperature=temperature,
126
- seed=seed,
127
- )
128
- output = pool.map(generate_parallel, selected_models)
129
- for i in range(len(output)):
130
- st.markdown(f"**{selected_models[i]}**")
131
- st.code(output[i])
 
10
  INCODER_IMG = (
11
  "https://huggingface.co/datasets/loubnabnl/repo-images/raw/main/incoder.png"
12
  )
13
+ MODELS = ["CodeParrot", "InCoder"]
14
 
15
  @st.cache()
16
  def load_examples():
 
32
 
33
  st.set_page_config(page_icon=":laptop:", layout="wide")
34
 
35
+ # Introduction
36
+ st.title("Code generation with 🤗")
37
+ with open("utils/intro.txt", "r") as f:
38
+ intro = f.read()
39
+ st.markdown(intro)
40
+
41
+ # Pretraining datasets
42
+ st.title("1 - Pretraining datasets 📚")
43
+ st.markdown(
44
+ f"Preview of some code files from Github repositories in [Github-code dataset]({GITHUB_CODE}):"
45
+ )
46
+ df = pd.read_csv("utils/data_preview.csv")
47
+ st.dataframe(df)
48
+ st.header("Model")
49
+ selected_model = st.selectbox(
50
+ "Select a code generation model", MODELS, default=["CodeParrot"]
51
+ )
52
+ with open(f"datasets/{selected_model.lower()}.txt", "r") as f:
53
+ text = f.read()
54
+ st.markdown(text)
55
+
56
+ # Model architecture
57
+ st.title("Model architecture")
58
+ st.markdow("Most code generation models use GPT style architectures trained on code. Some use encoder-decoder architectures such as AlphaCode.")
59
+ st.header("Model")
60
+ selected_model = st.selectbox(
61
+ "Select a code generation model", MODELS, default=["CodeParrot"]
62
+ )
63
+ with open(f"architectures/{selected_model.lower()}.txt", "r") as f:
64
+ text = f.read()
65
+ st.markdown(text)
66
+ if model == "InCoder":
67
+ st.image(INCODER_IMG, caption="Figure 1: InCoder training", width=700)
68
+
69
+ # Model evaluation
70
+ st.title("Code models evaluation 📊")
71
+ with open("evaluation/intro.txt", "r") as f:
72
+ intro = f.read()
73
+ st.markdown(intro)
74
+
75
+ # Code generation
76
+ st.title("Code generation 💻")
77
+ st.header("Models")
78
  selected_models = st.sidebar.multiselect(
79
+ "Select code generation models to compare", MODELS, default=["CodeParrot"]
80
  )
81
+ st.header("Examples")
82
+ examples = load_examples()
83
+ example_names = [example["name"] for example in examples]
84
+ name2id = dict([(name, i) for i, name in enumerate(example_names)])
85
+ selected_example = st.selectbox(
86
+ "Select one of the following examples or implement yours", example_names
87
+ )
88
+ example_text = examples[name2id[selected_example]]["value"]
89
+ default_length = examples[name2id[selected_example]]["length"]
90
+ st.header("Generation settings")
91
+ temperature = st.slider(
92
+ "Temperature:", value=0.2, min_value=0.0, step=0.1, max_value=2.0
93
+ )
94
+ max_new_tokens = st.slider(
95
+ "Number of tokens to generate:",
96
+ value=default_length,
97
+ min_value=8,
98
+ step=8,
99
+ max_value=256,
100
+ )
101
+ seed = st.slider(
102
+ "Random seed:", value=42, min_value=0, step=1, max_value=1000
103
+ )
104
+ gen_prompt = st.text_area(
105
+ "Generate code with prompt:",
106
+ value=example_text,
107
+ height=220,
108
+ ).strip()
109
+ if st.button("Generate code!"):
110
+ with st.spinner("Generating code..."):
111
+ # Create a multiprocessing Pool
112
+ pool = Pool()
113
+ generate_parallel = partial(
114
+ generate_code,
115
+ gen_prompt=gen_prompt,
116
+ max_new_tokens=max_new_tokens,
117
+ temperature=temperature,
118
+ seed=seed,
119
+ )
120
+ output = pool.map(generate_parallel, selected_models)
121
+ for i in range(len(output)):
122
+ st.markdown(f"**{selected_models[i]}**")
123
+ st.code(output[i])