Ivan Tan committed on
Commit
c073aa2
·
1 Parent(s): 19c813f

Init repo with app, data and models

Browse files
.gitattributes CHANGED
@@ -29,3 +29,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
29
  *.zip filter=lfs diff=lfs merge=lfs -text
30
  *.zstandard filter=lfs diff=lfs merge=lfs -text
31
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
29
  *.zip filter=lfs diff=lfs merge=lfs -text
30
  *.zstandard filter=lfs diff=lfs merge=lfs -text
31
  *tfevents* filter=lfs diff=lfs merge=lfs -text
32
+ train.csv filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ # In[10]:
5
+
6
+
7
+ import pandas as pd
8
+ import os
9
+ import torch
10
+ from transformers import T5Tokenizer, T5ForConditionalGeneration
11
+ from transformers.optimization import Adafactor
12
+ import time
13
+ import warnings
14
+ import random
15
+ warnings.filterwarnings('ignore')
16
+
17
+ import re
18
+
19
# Compiled once at import time: matches "<", then the shortest run of
# characters that are not "<", then ">".
_TAG_PATTERN = re.compile(r'<[^<]+?>')


def strip_html(text: str) -> str:
    """Return *text* with every ``<...>`` span removed.

    Used on decoded model output, where it removes angle-bracket tokens
    such as ``<pad>`` and ``</s>``. A lone ``<`` with no closing ``>`` is
    left untouched, and surrounding whitespace is not trimmed.
    """
    return _TAG_PATTERN.sub('', text)
21
+
22
+
23
+ # In[5]:
24
+
25
+
26
# Columns of train.csv that are serialised into the model input record.
train_columns = [
    'round_amount', 'round_date', 'stage', 'investee',
    'investee_description', 'investee_country', 'investee_region',
    'investee_subregion', 'investee_vertical', 'investee_industry',
    'investor_list', 'previous_investors', 'prior_funding',
]
train = pd.read_csv("train.csv")

# Parse the publication timestamp column into pandas datetimes.
train['publication_timestamp'] = pd.to_datetime(train['publication_timestamp'])

# One dict per row, restricted to the input columns.
input_text = train[train_columns].to_dict(orient='records')

# Frame with the headline as target, the stringified record as input and a
# constant task prefix.
train_df = train[['title']].rename(columns={'title': 'target_text'})
train_df['input_text'] = [str(record) for record in input_text]
train_df['prefix'] = 'tia'
47
+
48
+
49
+ # In[8]:
50
+
51
+
52
# Use the first CUDA device when one is available, otherwise the CPU.
dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Running on the GPU" if dev.type == "cuda" else "Running on the CPU")

# Tokenizer comes from the base checkpoint name; the model weights are read
# from the local 't5-v1_1-base_tia/' directory only (local_files_only=True).
tokenizer = T5Tokenizer.from_pretrained('google/t5-v1_1-base')
model = T5ForConditionalGeneration.from_pretrained(
    't5-v1_1-base_tia/', local_files_only=True
)
# Move the model to the selected device (GPU/CPU).
model.to(dev)
67
+
68
+
69
+ # In[12]:
70
+
71
+
72
# Distinct (industry, vertical) pairs that occur in the training data.
vi_table = train[['investee_industry', 'investee_vertical']].drop_duplicates()


def _first_match(lookup_column, value, result_column):
    """Return the first *result_column* entry whose *lookup_column* equals *value*.

    Returns ``None`` when no row matches (for example when *value* is
    ``None``/NaN or is not present in ``vi_table``) instead of raising
    ``IndexError``.
    """
    matches = vi_table.loc[vi_table[lookup_column] == value, result_column]
    if len(matches) == 0:
        return None
    return matches.iloc[0]


def update_industry(value):
    """Map an industry to a vertical.

    Args:
        value: an ``investee_industry`` value.

    Returns:
        The first ``investee_vertical`` paired with that industry in
        ``vi_table``, or ``None`` if the industry is not found.
    """
    return _first_match('investee_industry', value, 'investee_vertical')


def update_vertical(value):
    """Map a vertical to an industry.

    Args:
        value: an ``investee_vertical`` value.

    Returns:
        The first ``investee_industry`` paired with that vertical in
        ``vi_table``, or ``None`` if the vertical is not found.
    """
    return _first_match('investee_vertical', value, 'investee_industry')


# Notebook smoke checks; the results are discarded. With the no-match guard
# these can no longer abort start-up when the values are absent from the data.
update_industry('Green')
update_vertical('Clean tech')
97
+
98
+
99
+ # In[ ]:
100
+
101
+
102
+ import gradio as gr
103
+
104
+
105
+ # In[ ]:
106
+
107
+
108
+ num_return_sequences = 5
109
+
110
+
111
+ # In[ ]:
112
+
113
+
114
def generate_headline(stage, investee_country, investee_subregion, investee_region,
                      investee_vertical, investee_industry,
                      round_amount, investee, investee_description, investor_list,
                      previous_investors, other_values):
    """Generate candidate news headlines for one funding round.

    Args:
        stage, investee_country, investee_subregion, investee_region,
        investee_vertical, investee_industry, investee, investee_description,
        investor_list, previous_investors: field values written into the
            record as given.
        round_amount: numeric amount; stored as ``str(float(round_amount))``.
        other_values: DataFrame with a ``key`` column; after transposition
            its keys become columns. It must supply ``round_date`` and
            ``prior_funding``, which are selected below but not passed as
            separate arguments.

    Returns:
        ``num_return_sequences`` decoded headlines joined by newlines, with
        ``<...>`` tokens removed by ``strip_html``.
    """
    # Turn the key/value table into a single row whose columns are the keys.
    full_df = other_values.set_index("key").T

    full_df['stage'] = stage
    full_df['investee_country'] = investee_country
    full_df['investee_subregion'] = investee_subregion
    full_df['investee_region'] = investee_region
    full_df['investee_vertical'] = investee_vertical
    full_df['investee_industry'] = investee_industry
    full_df['round_amount'] = str(float(round_amount))
    full_df['investee'] = investee
    full_df['investee_description'] = investee_description
    full_df['investor_list'] = investor_list
    full_df['previous_investors'] = previous_investors

    # Serialise the fields in the same column order as `train_columns`.
    record_json = full_df[train_columns].to_json(orient="records")

    # NOTE(review): the prompt wraps the JSON array in an extra pair of
    # braces; presumably this mirrors the fine-tuning input format - confirm.
    input_ids = tokenizer.encode(f"tia: {{{record_json}}}", return_tensors="pt")  # Batch size 1
    input_ids = input_ids.to(dev)

    # Single sampling pass; a preceding default generate() call whose result
    # was never used has been dropped.
    text_outputs = model.generate(inputs=input_ids, do_sample=True,
                                  num_beams=2,
                                  num_return_sequences=num_return_sequences,
                                  repetition_penalty=5.0)
    headlines = [strip_html(tokenizer.decode(o)) for o in text_outputs]
    return "\n".join(headlines)
149
+
150
+
151
+ # In[ ]:
152
+
153
+
154
other_columns = ['round_date', 'prior_funding']

# Notebook inspection: one random row's extra columns as (key, value) pairs.
# The result is discarded when run as a script.
train.sample(1)[other_columns].T.reset_index().values

# Notebook inspection: headline and input fields of the first NOSH row.
_nosh_rows = train.query("investee == 'NOSH'")
print(_nosh_rows['title'].head(1).T)
_nosh_rows[train_columns].head(1).T

# Hand-written example record with the same fields as `train_columns`.
fake_data = dict(
    round_amount=1000000.0,
    round_date="2018-09-26",
    stage="Pre-series A",
    investee="NOSH",
    investee_description="NOSH makes and delivers ready-to-eat meals in Hong Kong.",
    investee_country="Hong Kong",
    investee_region="Asia",
    investee_subregion="Eastern Asia",
    investee_vertical="Food tech",
    investee_industry="Restaurants & Food",
    investor_list=["Alibaba Entrepreneurs Fund (阿里巴巴创业者基金)"],
    previous_investors="",
    prior_funding=1000000.0,
)

# Notebook inspection: show the example record as a column.
pd.DataFrame([fake_data]).transpose()
194
+
195
+
196
+ # In[ ]:
197
+
198
+
199
demo = gr.Blocks()

# Form defaults come from the hand-written example record.
random_sample = pd.DataFrame([fake_data])


def _choices(column):
    """Return the distinct values of *column* in the training data."""
    return list(train[column].unique())


def _default(column):
    """Return the example record's value for *column*."""
    return random_sample[column].values[0]


# `stage` takes its default from the example record like the other dropdowns.
stage = gr.Dropdown(label="stage", choices=_choices("stage"),
                    value=_default("stage"))
investee_country = gr.Dropdown(label="investee_country",
                               choices=_choices("investee_country"),
                               value=_default("investee_country"))
investee_subregion = gr.Dropdown(label="investee_subregion",
                                 choices=_choices("investee_subregion"),
                                 value=_default("investee_subregion"))
investee_region = gr.Dropdown(label="investee_region",
                              choices=_choices("investee_region"),
                              value=_default("investee_region"))
investee_vertical = gr.Dropdown(label="investee_vertical",
                                choices=_choices("investee_vertical"),
                                value=_default("investee_vertical"))
investee_industry = gr.Dropdown(label="investee_industry",
                                choices=_choices("investee_industry"),
                                value=_default("investee_industry"))

_ROUND_AMOUNT_MIN = 100000
_ROUND_AMOUNT_MAX = 200000000

# Fall back to the slider minimum (not 0, which is outside the slider range)
# when the example record has no amount.
if pd.isnull(_default("round_amount")):
    rand_amount = _ROUND_AMOUNT_MIN
else:
    rand_amount = _default("round_amount")

round_amount = gr.Slider(label="round_amount",
                         minimum=_ROUND_AMOUNT_MIN, maximum=_ROUND_AMOUNT_MAX,
                         value=rand_amount,
                         step=100000)

investee = gr.Textbox(label="investee", value=_default("investee"))
investee_description = gr.Textbox(label="investee_description",
                                  value=_default("investee_description"))
investor_list = gr.Textbox(label="investor_list",
                           value=_default("investor_list"))
previous_investors = gr.Textbox(label="previous_investors",
                                value=_default("previous_investors"))
# Remaining fields are edited as a key/value table and merged into the
# record inside generate_headline.
other_values = gr.Dataframe(
    headers=['key', 'value'],
    value=[['round_date', _default("round_date")],
           ['prior_funding', _default("prior_funding")]]
)
out = gr.Textbox(max_lines=num_return_sequences)

with demo:
    gr.Markdown("Enter funding data to generate news headline.")

    # Order must match the positional parameters of generate_headline.
    inputs = [stage, investee_country, investee_subregion, investee_region,
              investee_vertical, investee_industry,
              round_amount, investee, investee_description, investor_list,
              previous_investors, other_values]

    # NOTE(review): the two handlers update each other's dropdown, so a
    # change on one side may trigger the other handler in turn - confirm
    # this round trip is intended.
    investee_industry.change(fn=update_industry, inputs=investee_industry,
                             outputs=investee_vertical)
    investee_vertical.change(fn=update_vertical, inputs=investee_vertical,
                             outputs=investee_industry)
    gr.Interface(fn=generate_headline, inputs=inputs, outputs=out, live=True)

# NOTE(review): credentials are hard-coded and trivially guessable; move
# them to configuration or secrets before exposing the app.
demo.launch(
    share=False, auth=("123", "123")
)

# Teardown, reached once launch() returns.
demo.close()

gr.close_all()
268
+
269
+
270
+ # In[ ]:
271
+
272
+
273
+
274
+
275
+
276
+ # In[ ]:
277
+
278
+
279
+
280
+
281
+
282
+ # In[ ]:
283
+
284
+
285
+
286
+
287
+
288
+ # In[ ]:
289
+
290
+
291
+
292
+
293
+
294
+ # In[ ]:
295
+
296
+
297
+
298
+
t5-v1_1-base_tia/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/t5-v1_1-base",
3
+ "architectures": [
4
+ "T5ForConditionalGeneration"
5
+ ],
6
+ "d_ff": 2048,
7
+ "d_kv": 64,
8
+ "d_model": 768,
9
+ "decoder_start_token_id": 0,
10
+ "dense_act_fn": "gelu_new",
11
+ "dropout_rate": 0.1,
12
+ "eos_token_id": 1,
13
+ "feed_forward_proj": "gated-gelu",
14
+ "initializer_factor": 1.0,
15
+ "is_encoder_decoder": true,
16
+ "is_gated_act": true,
17
+ "layer_norm_epsilon": 1e-06,
18
+ "model_type": "t5",
19
+ "num_decoder_layers": 12,
20
+ "num_heads": 12,
21
+ "num_layers": 12,
22
+ "output_past": true,
23
+ "pad_token_id": 0,
24
+ "relative_attention_max_distance": 128,
25
+ "relative_attention_num_buckets": 32,
26
+ "tie_word_embeddings": false,
27
+ "torch_dtype": "float32",
28
+ "transformers_version": "4.20.1",
29
+ "use_cache": true,
30
+ "vocab_size": 32128
31
+ }
t5-v1_1-base_tia/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15c826dfd8c15a7750d1961a646e49f145d669052224b39ac7f5c698422ba13e
3
+ size 990406605
train.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:999febaa4d0e013cb0c89ba43c657bfdf13d9d7d8e52f4050b64341ff833489d
3
+ size 34332589