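"""
Streamlit app: Text to Causal Knowledge Graph.

Pipeline (the fine-tuned model checkpoints are assumed to sit next to this
script, based on the local paths used below):
  1. Extract text from an uploaded PDF and split it into sentences.
  2. Flag causal sentences with a fine-tuned BERT sequence classifier
     ("checkpoint-2850").
  3. Tag cause/effect/trigger spans with a fine-tuned DistilBERT token
     classifier ("DistilBertforTokenClassification").
  4. Map each tagged span to a stakeholder category with a pickled
     scikit-learn classifier ("Checkpoint-classification.sav").
  5. Aggregate cause -> effect counts into a matrix and render it with the
     bundled D3 page ("index.html").

Typically launched with: streamlit run <this file>
"""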
# Standard library
import csv
import json
import pathlib
import pickle
import re
import shutil

import nltk
import pandas as pd

# Streamlit UI and HTML embedding
import streamlit as st
import streamlit.components.v1 as components

# PDF text extraction
from PyPDF2 import PdfReader

# Hugging Face models: sentence-level causal classifier and
# token-level cause/effect tagger
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DistilBertForTokenClassification,
    DistilBertTokenizerFast,
    pipeline,
)

# The NLTK sentence tokenizer requires the 'punkt' data package
nltk.download('punkt', quiet=True)
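# Files expected in the working directory (an assumption inferred from the
# paths referenced below): checkpoint-2850/, DistilBertforTokenClassification/,
# Checkpoint-classification.sav, vectorizefile_classification.pickle,
# tree.css, div.css, side.css, index.html. scikit-learn must also be
# installed, since the two pickles are sklearn objects.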
def main():
    st.title("Text to Causal Knowledge Graph")
    st.sidebar.title("Please upload your text documents in one file here:")
    uploaded_file = st.sidebar.file_uploader("Choose a file", type="pdf")
    if uploaded_file is None:
        st.info("Upload a PDF in the sidebar to begin.")
        st.stop()

    # Extract raw text, one string per page
    text_list = []
    reader = PdfReader(uploaded_file)
    for page in reader.pages:
        text_list.append(page.extract_text())
    full_text = ' '.join(x.replace('\n', ' ') for x in text_list if x)
    full_text = full_text.replace('"', '')

    # Split into sentences, lowercase, and strip punctuation
    sentences = nltk.sent_tokenize(full_text)
    result = [re.sub(r'[^\w\s]', '', s.lower()) for s in sentences]

    # Stage 1: sentence-level causal vs. non-causal classification
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model_path = "checkpoint-2850"
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path, id2label={0: 'non-causal', 1: 'causal'})
    pipe1 = pipeline("text-classification", model=model, tokenizer=tokenizer)
    causal_sents = []
    for sent in result:
        for lab in pipe1(sent):
            if lab['label'] == 'causal':
                causal_sents.append(sent)
    # Stage 2: token-level tagging of causal trigger (CT), effect (E),
    # cause (C), and other (O) spans
    model_name = "distilbert-base-uncased"
    tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
    model_path1 = "DistilBertforTokenClassification"
    model = DistilBertForTokenClassification.from_pretrained(
        model_path1, id2label={0: 'CT', 1: 'E', 2: 'C', 3: 'O'})
    pipe = pipeline('ner', model=model, tokenizer=tokenizer,
                    aggregation_strategy='simple')

    sentence_pred = []
    class_list = []
    entity_list = []
    for sent in causal_sents:
        for ent in pipe(sent):
            sentence_pred.append(sent)
            class_list.append(ent['word'])
            entity_list.append(ent['entity_group'])

    if not class_list:
        st.warning("No causal sentences were found in the document.")
        st.stop()

    # Stage 3: classify each tagged span into a stakeholder category
    with open('Checkpoint-classification.sav', 'rb') as f:
        loaded_model = pickle.load(f)
    with open('vectorizefile_classification.pickle', 'rb') as f:
        loaded_vectorizer = pickle.load(f)
    pipeline_test_output = loaded_vectorizer.transform(class_list)
    predicted = loaded_model.predict(pipeline_test_output)
    # Level-1 label: class 3 is 'Non-Performance', everything else 'Performance'
    level0 = ['Non-Performance' if i == 3 else 'Performance' for i in predicted]
    # Level-2 label: fine-grained stakeholder category
    list_pred = {0: 'Customers', 1: 'Employees', 2: 'Investors',
                 3: 'Non-performance', 4: 'Society', 5: 'Unclassified'}
    pred_val = [list_pred[i] for i in predicted]
    sent_id, unique = pd.factorize(sentence_pred)
    final_list = pd.DataFrame(
        {'Id': sent_id,
         'Full sentence': sentence_pred,
         'Component': class_list,
         'cause/effect': entity_list,
         'Label_level1': level0,
         'Label_level2': pred_val
         })

    # Merge WordPiece continuations ('##...') back onto the preceding token
    s = final_list['Component'].shift(-1)
    m = s.str.startswith('##', na=False)
    final_list.loc[m, 'Component'] += (' ' + s[m])
    final_list1 = final_list[~final_list['Component'].astype(str).str.startswith('##')]

    # Drop sentences that contain no effect ('E') span at all
    no_effect_ids = [
        i for i in final_list1['Id'].unique()
        if 'E' not in final_list1.loc[final_list1['Id'] == i, 'cause/effect'].values
    ]
    df3 = final_list1[~final_list1['Id'].isin(no_effect_ids)]

    # One row per (sentence, cause/effect, labels), components joined with commas
    df = (df3.groupby(['Id', 'Full sentence', 'cause/effect',
                       'Label_level1', 'Label_level2'])['Component']
             .apply(', '.join).reset_index())
    df['cause/effect'] = df['cause/effect'].replace({"C": "cause", "E": "effect"})
    df_final = df[df['cause/effect'] != 'CT'].copy()
    df_final['Component'] = df_final['Component'].replace(r'#+', ' ', regex=True)
    df_final = df_final[['Id', 'Full sentence', 'Component',
                         'cause/effect', 'Label_level1', 'Label_level2']]
    df_final.to_csv('predictions.csv')
    # Build the cause -> effect co-occurrence matrix across all sentences.
    # For each sentence, every (cause category, effect category) pair that
    # co-occurs contributes the larger of the two span counts to its cell.
    categories = ['Non-performance', 'Investors', 'Customers', 'Employees', 'Society']
    df_tab = pd.DataFrame(0, index=categories, columns=categories)
    for sid in df_final['Id'].unique():
        j = df_final.loc[df_final['Id'] == sid]
        cause_tab = j.loc[j['cause/effect'] == 'cause']
        effect_tab = j.loc[j['cause/effect'] == 'effect']
        for cause_cat in categories:
            cause_count = (cause_tab.Label_level2 == cause_cat).sum()
            if cause_count == 0:
                continue
            for effect_cat in categories:
                effect_count = (effect_tab.Label_level2 == effect_cat).sum()
                if effect_count > 0:
                    df_tab.loc[cause_cat, effect_cat] += max(cause_count, effect_count)
    df_tab.to_csv('final_data.csv')

    # Convert the matrix to the source/target/value JSON the D3 page expects
    json_data = []
    for row in df_tab.index:
        for col in df_tab.columns:
            json_data.append({
                'source': row,
                'target': col,
                'value': int(df_tab.loc[row, col])
            })
    with open('smalljson.json', 'w') as f:
        json.dump(json_data, f)

    # Also export the detailed per-sentence predictions as JSON
    with open("predictions.csv", "r") as f:
        data_list = [dict(row) for row in csv.DictReader(f)]
    with open("ch.json", "w") as f:
        json.dump(data_list, f)
    @st.cache_data
    def convert_df(df):
        # Cache the conversion to prevent recomputation on every rerun
        return df.to_csv().encode('utf-8')

    csv1 = convert_df(df_final.astype(str))
    csv2 = convert_df(df_tab.astype(str))
    with st.container():
        st.download_button(label="Download the detailed result table",
                           data=csv1, file_name='results.csv', mime='text/csv')
        st.download_button(label="Download the result table",
                           data=csv2, file_name='final_data.csv', mime='text/csv')
    # Copy the CSS and data files into Streamlit's static directory so the
    # embedded D3 page (index.html) can fetch them
    STREAMLIT_STATIC_PATH = pathlib.Path(st.__path__[0]) / 'static'
    CSS_PATH = STREAMLIT_STATIC_PATH / "css1"
    CSS_PATH.mkdir(exist_ok=True)
    if not (CSS_PATH / "tree.css").exists():
        shutil.copy("tree.css", CSS_PATH / "tree.css")
        shutil.copy("div.css", CSS_PATH / "div.css")
        shutil.copy("side.css", CSS_PATH / "side.css")
    # The JSON is regenerated for every upload, so always refresh the copy
    shutil.copy("smalljson.json", CSS_PATH / "smalljson.json")

    with open("index.html", 'r', encoding='utf-8') as HtmlFile:
        source_code = HtmlFile.read()
    components.html(source_code)
if __name__ == '__main__':
    main()