import math
import re
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer

nltk.download("wordnet")
# nltk.download("omw-1.4")  # uncomment if your NLTK version requires it

# Initialize the WordNet lemmatizer
wnl = WordNetLemmatizer()
# Ten reference documents shipped with the app
files = [f'./assets/text{i}.txt' for i in range(1, 11)]
gist_file = open("gist_stopwords.txt", "r") | |
try: | |
content = gist_file.read() | |
stopwords = content.split(",") | |
finally: | |
gist_file.close() | |
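# Expected file format (illustrative assumption, not verified against the actual gist):
# a single comma-separated line such as
#   i,me,my,myself,we,our,ours,...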
def read_file(name):
    with open(name, 'r') as file:
        contents = file.read()
    return contents
def process_string(name):
    # lowercase the whole string
    text = name.lower()
    # tokenize: \w+ matches runs of letters, digits, or underscores, which drops punctuation
    tokens = re.findall(r'\w+', text)
    # remove commonly used words like 'is', 'the', 'a', etc.
    filtered_tokens = [token for token in tokens if token not in stopwords]
    # reduce words to their root form, e.g. 'cats' to 'cat'
    root_tokens = [wnl.lemmatize(token, pos='n') for token in filtered_tokens]
    return root_tokens
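# Illustrative example (assumes 'the' and 'are' appear in gist_stopwords.txt):
#   process_string("The cats are running") -> ['cat', 'running']
# Because pos='n' lemmatizes nouns only, verb forms such as 'running' pass through unchanged.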
def process_tokens(tokens, st_global_words):
    # Count every token once with Counter instead of repeated list.count() scans
    counts = Counter(tokens)
    freq_dict = {}
    tf_dict = {}
    for word in st_global_words:
        freq_dict[word] = counts[word]
        # term frequency: raw count normalized by document length
        tf_dict[word] = freq_dict[word] / len(tokens)
    return freq_dict, tf_dict
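# Illustrative sketch: for tokens = ['cat', 'cat', 'dog'] and vocabulary {'cat', 'dog', 'fish'},
#   freq_dict -> {'cat': 2, 'dog': 1, 'fish': 0}
#   tf_dict   -> {'cat': 0.667, 'dog': 0.333, 'fish': 0.0}  (counts divided by the 3 tokens)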
def main(input1, input2):
    # The two user inputs become documents 0 and 1; the reference files follow
    processed_files = [read_file(file) for file in files]
    processed_files.insert(0, input2)
    processed_files.insert(0, input1)
    processed_strings = [process_string(file) for file in processed_files]
    # Build the global vocabulary across all documents
    st_global_words = set()
    for tokens in processed_strings:
        st_global_words.update(tokens)
    processed_tokens = []
    for tokens in processed_strings:
        freq_dict, tf_dict = process_tokens(tokens, st_global_words)
        processed_tokens.append((freq_dict, tf_dict))
    # Inverse document frequency: idf(word) = ln(N / df(word)), where N is the number
    # of documents and df(word) the number of documents containing the word.
    # df is never zero because the vocabulary is built from these same documents.
    idf_dict = {}
    for word in st_global_words:
        cnt = 0
        for freq_dict, tf_dict in processed_tokens:
            if freq_dict[word] > 0:
                cnt += 1
        idf_dict[word] = math.log(len(processed_tokens) / cnt)
    # Assemble one row per word: idf, then per-document freq, tf, and tf-idf columns
    df = pd.DataFrame({'word': list(st_global_words)})
    df['idf_col'] = [idf_dict[word] for word in st_global_words]
    for i, (freq_dict, tf_dict) in enumerate(processed_tokens):
        df[f'freq_{i+1}'] = [freq_dict[word] for word in st_global_words]
        df[f'tf_{i+1}'] = [tf_dict[word] for word in st_global_words]
        df[f'tfidf_{i+1}'] = df[f'tf_{i+1}'] * df['idf_col']
    # Collect the tf-idf columns into one array, one row per document
    tf_idf_cols = [col for col in df.columns if 'tfidf' in col]
    tf_idf_vals = np.array([df[col].values for col in tf_idf_cols])
    return tf_idf_vals
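# Shape note: with 2 user inputs plus 10 reference files, tf_idf_vals has shape
# (12, len(st_global_words)); rows 0 and 1 are the two inputs compared below.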
def cosine_diff(A, B):
    # Despite the name, this returns cosine *similarity* (1.0 means identical direction)
    dot_product = sum(a * b for a, b in zip(A, B))
    norm_A = math.sqrt(sum(a ** 2 for a in A))
    norm_B = math.sqrt(sum(b ** 2 for b in B))
    similarity = dot_product / (norm_A * norm_B)
    return similarity
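# Equivalent NumPy form (a sketch, same result for 1-D arrays):
#   np.dot(A, B) / (np.linalg.norm(A) * np.linalg.norm(B))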
def euclidean(A, B):
    # Square root of the sum of squared coordinate differences
    su = sum((a - b) ** 2 for a, b in zip(A, B))
    return math.sqrt(su)
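# Equivalent NumPy form: np.linalg.norm(np.asarray(A) - np.asarray(B))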
def final_main(input1, input2):
    tf_idf_vals = main(input1, input2)
    outputString = ""
    similarity = cosine_diff(tf_idf_vals[0], tf_idf_vals[1])
    outputString += f"Cosine similarity: {round(similarity * 100, 2)}%\n"
    diff = euclidean(tf_idf_vals[0], tf_idf_vals[1])
    # euclidean() already returns the square root, so don't take sqrt again;
    # the distance is not a percentage, so report the raw value
    outputString += f"Euclidean distance (difference): {round(diff, 4)}\n"
    return outputString
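# Minimal usage sketch (assumes ./assets/text1.txt ... text10.txt and
# gist_stopwords.txt exist; the sample strings are hypothetical):
if __name__ == "__main__":
    sample_a = "Machine learning models learn patterns from data."
    sample_b = "Deep learning is a family of machine learning methods."
    print(final_main(sample_a, sample_b))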