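# DebateKG - builds policy debate cases by finding shortest paths through a
# txtai semantic graph constructed over the DebateSum dataset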
import pysbd
from txtai.embeddings import Embeddings
import networkx as nx
from tqdm import tqdm
from txtai.graph import GraphFactory
from datasets import load_dataset
import streamlit as st
import streamlit.components.v1 as components
import string
st.set_page_config(page_title="DebateKG")
st.title("DebateKG - Automatic Policy Debate Case Creation")
st.caption("github: https://github.com/Hellisotherpeople/DebateKG")
form = st.sidebar.form("Main Settings")
form.header("Main Settings")
highlight_threshold = form.number_input("Enter the minimum similarity score needed to highlight a token", value = 0.05)
show_extract = form.checkbox("Show extracts", value = True)
show_abstract = form.checkbox("Show abstract", value = False)
show_full_doc = form.checkbox("Show full doc", value = False)
show_citation = form.checkbox("Show citation", value = True)
rerank_word = form.text_input("(Optional) Constrain all evidence in the case to have this word within its text", value = "")
form.caption("Constraining the graph this way may leave DebateKG unable to find a valid path through the graph to build a case")
html_window_width = form.number_input("Enter the pixel width of the output debate case window", value = 1000)
html_window_height = form.number_input("Enter the pixel height of the output debate case window", value = 1000)
option = form.selectbox(
'Which Knowledge Graph do you want to use?',
('DebateSum_SemanticGraph_longformer_extract.tar.gz', 'DebateSum_SemanticGraph_longformer_abstract.tar.gz', 'DebateSum_SemanticGraph_mpnet_abstract.tar.gz', 'DebateSum_SemanticGraph_legalbert_abstract.tar.gz', 'DebateSum_SemanticGraph_legalbert_extract.tar.gz', 'DebateSum_SemanticGraph_mpnet_extract.tar.gz', 'DebateSum_SemanticGraph_mpnet_sentence.tar.gz'), index = 2)
form.form_submit_button("Change Settings")
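# Cache the dataset and the embeddings index so they persist across Streamlit reruns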
@st.cache(allow_output_mutation=True)
def load_my_dataset():
dataset = load_dataset("Hellisotherpeople/DebateSum", split = "train")
return dataset
@st.cache(allow_output_mutation=True)
def load_embeddings():
embeddings = Embeddings({
"path": "sentence-transformers/all-mpnet-base-v2",
"content": True,
"functions": [
{"name": "graph", "function": "graph.attribute"},
],
"expressions": [
{"name": "topic", "expression": "graph(indexid, 'topic')"},
{"name": "topicrank", "expression": "graph(indexid, 'topicrank')"}
],
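        # Semantic graph settings: up to 100 edges per node, edges require a
        # minimum similarity of 0.10, and topic labels are built from 4 terms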
"graph": {
"limit": 100,
"minscore": 0.10,
"topics": {
"terms": 4,
"resolution" : 100
}
}
})
embeddings.load(option)
return embeddings
dataset = load_my_dataset()
embeddings = load_embeddings()
graph = embeddings.graph
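# Convert edge similarity weights to distances for shortest-path search.
# Near-duplicate nodes (similarity above 0.85) are pushed to the maximum
# distance of 1.0 so that cases don't chain redundant evidence.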
def david_distance(source, target, attrs):
distance = max(1.0 - attrs["weight"], 0.0)
return distance if distance >= 0.15 else 1.00
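# Shortest path (or all shortest paths) between two argument nodes under the custom distance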
def david_showpath(source, target, the_graph):
return nx.shortest_path(the_graph, source, target, david_distance)
def david_show_all_paths(source, target, the_graph):
return nx.all_shortest_paths(the_graph, source, target, david_distance)
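# Render one numbered step of the case, highlighting tokens whose explain()
# score exceeds the user-set threshold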
def highlight(index, result):
output = f"{index}. "
spans = [(token, score, "#fff59d" if score > highlight_threshold else None) for token, score in result["tokens"]]
for token, _, color in spans:
output += f"<span style='background-color: {color}'>{token}</span> " if color else f"{token} "
return output
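# Walk each consecutive pair of arguments, join their shortest paths, and render
# every node on the combined path along with the metadata panels the user enabled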
def showpath_any(list_of_arguments, strip_punctuation = True, the_graph=graph.backend):
list_of_paths = []
    for x, y in zip(list_of_arguments, list_of_arguments[1:]):
        a_path = david_showpath(x, y, the_graph)
        if list_of_paths:
            a_path = a_path[1:]  # skip the junction node already added by the previous segment
        list_of_paths.extend(a_path)
path = [graph.attribute(p, "text") for p in list_of_paths]
list_of_evidence_ids = []
for text in path:
if strip_punctuation:
text = text.translate(str.maketrans("","", string.punctuation))
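        # Look up the evidence row id via semantic search; punctuation (including
        # quotes that would break the embedded SQL string) is stripped above when
        # strip_punctuation is enabled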
list_of_evidence_ids.append(int(embeddings.search(f"select id from txtai where similar('{text}') limit 1")[0]['id']))
sections = []
for x, p in enumerate(path):
if x == 0:
# Print start node
sections.append(f"{x + 1}. {p}")
if show_abstract:
sections.append(dataset["Abstract"][list_of_evidence_ids[x]])
if show_citation:
sections.append(dataset["Citation"][list_of_evidence_ids[x]])
if show_extract:
sections.append(dataset["Extract"][list_of_evidence_ids[x]])
if show_full_doc:
sections.append(dataset["Full-Document"][list_of_evidence_ids[x]])
if x < len(path) - 1:
# Explain and highlight next path element
results = embeddings.explain(p, [path[x + 1]], limit=1)[0]
sections.append(highlight(x + 2, results))
if show_abstract:
sections.append(dataset["Abstract"][list_of_evidence_ids[x+1]])
if show_citation:
sections.append(dataset["Citation"][list_of_evidence_ids[x+1]])
if show_extract:
sections.append(dataset["Extract"][list_of_evidence_ids[x+1]])
if show_full_doc:
sections.append(dataset["Full-Document"][list_of_evidence_ids[x+1]])
return components.html("<br/><br/>".join(sections), scrolling = True, width = html_window_width, height = html_window_height)
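# Semantic SQL helper: find candidate arguments, optionally constrained to rows
# whose text and topic contain the given rerank terms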
def question(text, rerank_word = "", rerank_topic = "", limit = 100):
return embeddings.search(f"select id, text, topic, evidence_id, score from txtai where similar('{text}') and text like '%{rerank_word}%' and topic like '%{rerank_topic}%' limit {limit}")
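# Example usage (hypothetical values):
#   question("US-China relations", rerank_word="china", limit=5)
#   showpath_any([250, 10000, 2405])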
query_form = st.form("Query the Index:")
query_form.write("Step 1: Find Arguments")
query_form.write("Use semantic SQL from txtai to find some arguments, we use indexids to keep track of them.")
query_form.caption("You can use the semantic SQL to explore the dataset too! The possibilities are limitless!")
query_sql = query_form.text_area("Enter a semantic SQL statement", value = "select topic, * from txtai where similar('Trump and US relations with China') and topic like '%trump%' and text like '%china%' limit 1")
query_form_submitted = query_form.form_submit_button("Query")
if query_form_submitted:
with st.expander("Output (Open Me)", expanded = False):
st.write(embeddings.search(query_sql))
paths_form = st.form("Build the Arguments")
paths_form.write("Step 2: Build a Policy Debate Case")
paths_form.write("Enter any number of indexids (arguments), DebateKG will build a debate case out of it which links them all together")
user_paths_string = paths_form.text_area("Enter a list of indexids separated by whitespace", value = "250 10000 2405")
user_paths_list_of_strings = user_paths_string.split()
user_paths_list = list(map(int, user_paths_list_of_strings))
paths_form_submitted = paths_form.form_submit_button("Build a Policy Debate Case")
if paths_form_submitted:
if rerank_word:
        selected_nodes = [n for n, v in graph.backend.nodes(data=True) if rerank_word in v['text']]  # the same filter works on v['topic']
H = graph.backend.subgraph(selected_nodes)
showpath_any(user_paths_list, the_graph = H)
else:
showpath_any(user_paths_list)