Spaces:

IvanTIA
/

TIABot

Runtime error

App Files Files Community

Ivan Tan committed on Aug 2, 2022

Commit

c073aa2

1 Parent(s): 19c813f

Init repo with app, data and models

Browse files

Files changed (5) hide show

.gitattributes +1 -0
app.py +298 -0
t5-v1_1-base_tia/config.json +31 -0
t5-v1_1-base_tia/pytorch_model.bin +3 -0
train.csv +3 -0

.gitattributes CHANGED Viewed

@@ -29,3 +29,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+train.csv filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,298 @@

+#!/usr/bin/env python
+# coding: utf-8
+# In[10]:
+import pandas as pd
+import os
+import torch
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+from transformers.optimization import  Adafactor
+import time
+import warnings
+import random
+warnings.filterwarnings('ignore')
+import re
+def strip_html(text):
+    return re.sub('<[^<]+?>', '', text)
+# In[5]:
+# Columns that are serialised together into the model's input text.
+train_columns = ['round_amount', 'round_date', 'stage', 'investee',
+       'investee_description', 'investee_country', 'investee_region',
+       'investee_subregion', 'investee_vertical', 'investee_industry',
+       'investor_list', 'previous_investors', 'prior_funding']
+# Read the training table from train.csv (path is relative to the working directory).
+train = pd.read_csv("train.csv")
+# In[6]:
+# Parse the publication_timestamp column into pandas datetimes.
+train.publication_timestamp = pd.to_datetime(train.publication_timestamp)
+# In[7]:
+# One dict per row, restricted to train_columns.
+input_text = train[train_columns].to_dict(orient='records')
+# The 'title' column is renamed to 'target_text'.
+train_df = train[['title']].rename(columns={'title':'target_text'})
+train_df['input_text'] = input_text
+# Constant prefix value stored alongside every row.
+train_df['prefix'] = 'tia'
+# Store each record dict as its str() representation.
+train_df.input_text = train_df.input_text.astype(str)
+# In[8]:
+if torch.cuda.is_available():
+    dev = torch.device("cuda:0")
+    print("Running on the GPU")
+else:
+    dev = torch.device("cpu")
+    print("Running on the CPU")
+# In[ ]:
+# The tokenizer is loaded by name ('google/t5-v1_1-base').
+tokenizer = T5Tokenizer.from_pretrained('google/t5-v1_1-base')
+# The model is loaded from the 't5-v1_1-base_tia/' directory with local_files_only=True.
+model = T5ForConditionalGeneration.from_pretrained('t5-v1_1-base_tia/', local_files_only=True)
+# Move the model to the selected device (GPU/CPU).
+model.to(dev)
+# In[12]:
+# Distinct (investee_industry, investee_vertical) pairs found in the training data.
+vi_table = train[['investee_industry', 'investee_vertical']].drop_duplicates()
+# In[13]:
+def update_industry(value):
+    verticals = list(vi_table[vi_table['investee_industry'] == value]['investee_vertical'].values)
+    return verticals[0]
+def update_vertical(value):
+    industries = list(vi_table[vi_table['investee_vertical'] == value]['investee_industry'].values)
+    return industries[0]
+# In[ ]:
+# Leftover notebook check: the return value is discarded when run as a script.
+update_industry('Green')
+# In[ ]:
+# Leftover notebook check: the return value is discarded when run as a script.
+update_vertical('Clean tech')
+# In[ ]:
+import gradio as gr
+# In[ ]:
+# Number of sequences requested from model.generate; also the output textbox's max_lines.
+num_return_sequences = 5
+# In[ ]:
+def generate_headline(stage, investee_country, investee_subregion, investee_region,
+                      investee_vertical, investee_industry,
+                      round_amount, investee, investee_description, investor_list, previous_investors,
+                      other_values):
+    full_df = other_values.set_index("key").T
+    full_df['stage'] = stage
+    full_df['investee_country'] = investee_country
+    full_df['investee_subregion'] = investee_subregion
+    full_df['investee_region'] = investee_region
+    full_df['investee_vertical'] = investee_vertical
+    full_df['investee_industry'] = investee_industry
+    full_df['round_amount'] = str(float(round_amount))
+    full_df['investee'] = investee
+    full_df['investee_description'] = investee_description
+    full_df['investor_list'] = investor_list
+    full_df['previous_investors'] = previous_investors
+    random_set =full_df[['round_amount', 'round_date', 'stage', 'investee',
+       'investee_description', 'investee_country', 'investee_region',
+       'investee_subregion', 'investee_vertical', 'investee_industry',
+       'investor_list', 'previous_investors', 'prior_funding']].to_json(orient="records")
+#    print(random_set)
+    input_ids = tokenizer.encode(f"tia: {{{random_set}}}", return_tensors="pt")  # Batch size 1
+    input_ids=input_ids.to(dev)
+    outputs = model.generate(input_ids)
+    # text_output = tokenizer.decode(outputs[0]) # Single output
+    text_outputs = model.generate(inputs=input_ids, do_sample=True,
+                                  num_beams=2,
+                                  num_return_sequences=num_return_sequences,
+                                 repetition_penalty=5.0)
+    outputs = [strip_html(tokenizer.decode(o)) for o in text_outputs]
+    return "\n".join(outputs)
+# In[ ]:
+# The two train_columns entries that also appear as keys of the other_values table further down.
+other_columns = ['round_date', 'prior_funding']
+# In[ ]:
+# Leftover notebook preview: the result is discarded when run as a script.
+train.sample(1)[other_columns].T.reset_index().values
+# In[ ]:
+# Print the title of the first row whose investee is 'NOSH'.
+print(train.query("investee == 'NOSH'")['title'].head(1).T)
+# Leftover notebook preview of the same row's input columns; the result is discarded.
+train.query("investee == 'NOSH'")[train_columns].head(1).T
+# In[ ]:
+fake_data = {
+   "round_amount":1000000.0,
+   "round_date":"2018-09-26",
+   "stage":"Pre-series A",
+   "investee":"NOSH",
+   "investee_description":"NOSH makes and delivers ready-to-eat meals in Hong Kong.",
+   "investee_country":"Hong Kong",
+   "investee_region":"Asia",
+   "investee_subregion":"Eastern Asia",
+   "investee_vertical":"Food tech",
+   "investee_industry":"Restaurants & Food",
+   "investor_list":["Alibaba Entrepreneurs Fund (阿里巴巴创业者基金)"],
+   "previous_investors":"",
+   "prior_funding":1000000.0
+}
+# In[ ]:
+pd.DataFrame([fake_data]).T
+# In[ ]:
+demo = gr.Blocks()
+random_sample = train[train_columns].sample(1)
+random_sample = pd.DataFrame([fake_data])
+stage = gr.Dropdown(label="stage", choices=list(train[train_columns].stage.unique()))
+investee_country = gr.Dropdown(label="investee_country", choices=list(train[train_columns].investee_country.unique()),
+                              value=random_sample.investee_country.values[0])
+investee_subregion = gr.Dropdown(label="investee_subregion", choices=list(train[train_columns].investee_subregion.unique()),
+                              value=random_sample.investee_subregion.values[0])
+investee_region = gr.Dropdown(label="investee_region", choices=list(train[train_columns].investee_region.unique()),
+                             value=random_sample.investee_region.values[0])
+investee_vertical = gr.Dropdown(label="investee_vertical", choices=list(train[train_columns].investee_vertical.unique()),
+                            value=random_sample.investee_vertical.values[0])
+investee_industry = gr.Dropdown(label="investee_industry", choices=list(train[train_columns].investee_industry.unique()),
+                            value=random_sample.investee_industry.values[0])
+if pd.isnull(random_sample.round_amount.values[0]):
+    rand_amount = 0
+else:
+    rand_amount = random_sample.round_amount.values[0]
+round_amount = gr.Slider(label="round_amount", minimum=100000, maximum=200000000,
+                         value=rand_amount,
+                         step=100000)
+investee = gr.Textbox(label="investee", value=random_sample.investee.values[0])
+investee_description = gr.Textbox(label="investee_description",
+           value=random_sample.investee_description.values[0])
+investor_list = gr.Textbox(label="investor_list",
+           value=random_sample.investor_list.values[0])
+previous_investors = gr.Textbox(label="previous_investors",
+           value=random_sample.previous_investors.values[0])
+other_values = gr.Dataframe(
+    headers=['key', 'value'],
+    value=[['round_date', random_sample.round_date.values[0]],
+   ['prior_funding', random_sample.prior_funding.values[0]]]
+)
+out = gr.Textbox(max_lines=num_return_sequences)
+with demo:
+    gr.Markdown("Enter funding data to generate news headline.")
+    inputs=[stage, investee_country, investee_subregion, investee_region,
+                      investee_vertical, investee_industry,
+                      round_amount, investee, investee_description, investor_list, previous_investors,
+                      other_values]
+    investee_industry.change(fn=update_industry, inputs=investee_industry, outputs=investee_vertical)
+    investee_vertical.change(fn=update_vertical, inputs=investee_vertical, outputs=investee_industry)
+    gr.Interface(fn=generate_headline, inputs=inputs, outputs=out, live=True)
+    description="Enter funding data to generate news headline.",
+    live=True
+demo.launch(
+    share=False, auth=("123", "123")
+)
+# In[76]:
+# Notebook teardown cell: close the app launched above.
+# NOTE(review): presumably only reached once launch() returns - confirm for script runs.
+demo.close()
+# In[77]:
+# Notebook teardown cell: close every Gradio app started from this process.
+gr.close_all()
+# In[ ]:
+# In[ ]:
+# In[ ]:
+# In[ ]:
+# In[ ]:

t5-v1_1-base_tia/config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "_name_or_path": "google/t5-v1_1-base",
+  "architectures": [
+    "T5ForConditionalGeneration"
+  ],
+  "d_ff": 2048,
+  "d_kv": 64,
+  "d_model": 768,
+  "decoder_start_token_id": 0,
+  "dense_act_fn": "gelu_new",
+  "dropout_rate": 0.1,
+  "eos_token_id": 1,
+  "feed_forward_proj": "gated-gelu",
+  "initializer_factor": 1.0,
+  "is_encoder_decoder": true,
+  "is_gated_act": true,
+  "layer_norm_epsilon": 1e-06,
+  "model_type": "t5",
+  "num_decoder_layers": 12,
+  "num_heads": 12,
+  "num_layers": 12,
+  "output_past": true,
+  "pad_token_id": 0,
+  "relative_attention_max_distance": 128,
+  "relative_attention_num_buckets": 32,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.20.1",
+  "use_cache": true,
+  "vocab_size": 32128
+}

t5-v1_1-base_tia/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:15c826dfd8c15a7750d1961a646e49f145d669052224b39ac7f5c698422ba13e
+size 990406605

train.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:999febaa4d0e013cb0c89ba43c657bfdf13d9d7d8e52f4050b64341ff833489d
+size 34332589