import dataclasses
import datetime
import functools
import glob
import io
import json
import os
import random
import re
import sys
import tarfile
import tempfile
from collections import defaultdict
from typing import Dict, List, Optional

import requests
import lxml.etree as ET
import pandas as pd
import numpy as np
import gensim
import spacy

import nltk
from nltk.corpus import framenet as fn
from nltk.corpus.reader.framenet import FramenetError

from flask import Flask, request, render_template, jsonify, redirect, abort, session, url_for

from sociofillmore.common.analyze_text import (
    FrameStructure,
    get_syntax_info,
    is_at_root,
    process_prediction_file,
    POSSIBLE_CONSTRUCTIONS,
    SYNTAX_ANALYSIS_CACHE_FILES,
    enrich_texts_df,
    read_frames_of_interest,
    load_deep_frames_cache,
    get_text_meta,
    analyze_single_document,
    get_tarball_blocks,
    analyze_external_file
)
from sociofillmore.crashes.utils import is_a_dutch_text


# Download NLTK resources unless the app is explicitly started in "local" mode
# (expected invocation: python <script> <host> <mode>).
if len(sys.argv) < 3 or sys.argv[2] != "local":
    nltk.download("framenet_v17", download_dir="/nltk_data")
    nltk.download("punkt", download_dir="/nltk_data")
    print("Done!")


PROTECTED_DATASETS = []

# Credentials come either from a local secrets file or from environment variables.
if os.path.exists("secrets.json"):
    with open("secrets.json", encoding="utf-8") as f:
        secrets = json.load(f)
    AUTH_KEY = secrets["auth_key"]
    PASSWORD = secrets["password"]
    SECRET_KEY = bytes(secrets["flask_secret_key"], "utf-8")
else:
    AUTH_KEY = os.environ.get("AUTH_KEY")
    PASSWORD = os.environ.get("PASSWORD")
    SECRET_KEY = os.environ.get("FLASK_SECRET_KEY")


print("Defining app...")
app = Flask(__name__)
app.secret_key = SECRET_KEY


def load_gensim_model(limit):
    print("Loading GENSIM model... [this can take a few minutes]")
    return gensim.models.KeyedVectors.load_word2vec_format(
        "data/embeddings/concat_glove_frames.w2v.txt", limit=limit
    )


gensim_m = load_gensim_model(100_000)

print("Loading SpaCy models...")
spacy_model_ud = spacy.load("xx_sent_ud_sm")
spacy_model_langs = {
    "it": spacy.load("it_core_news_md"),
    "nl": spacy.load("nl_core_news_md"),
    "en": spacy.load("en_core_web_md"),
}
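
# Note on the embedding file loaded by load_gensim_model above (an assumption based
# on how the model is used in /similar_frames below): the word2vec-format vocabulary
# is expected to mix GloVe word vectors prefixed with "glove_" and FrameNet frame
# vectors prefixed with "fn_", so that a query like this (illustrative) works:
#   gensim_m.most_similar(positive=["glove_kill"])  # -> [("fn_Killing", 0.71), ...]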


frame_freq_cache = {}

with open("resources/fn_frames_to_roles.json", encoding="utf-8") as f:
    fn_frames_to_roles = json.load(f)


VICTIM_AGE_GROUPS = ["0-12", "12-18", "18-30", "30-50", "50-70", "70-120"]
ALL_FOREIGN_NATIONALITIES = "estero (tutto)"

deep_frames_cache = load_deep_frames_cache()


def read_rai_provider_attrs():
|
|
df = pd.read_excel("resources/RAI_sources_mr.xlsx")
|
|
|
|
return {
|
|
"politics:man:left": df[df["politics_man"] == "L"]["source"].unique().tolist(),
|
|
"politics:tc:left": df[df["politics_tc_cat"] == "L"]["source"]
|
|
.unique()
|
|
.tolist(),
|
|
"politics:agg:left": df[df["politics_agg"] == "L"]["source"].unique().tolist(),
|
|
"politics:man:right": df[df["politics_man"] == "R"]["source"].unique().tolist(),
|
|
"politics:tc:right": df[df["politics_tc_cat"] == "R"]["source"]
|
|
.unique()
|
|
.tolist(),
|
|
"politics:agg:right": df[df["politics_agg"] == "R"]["source"].unique().tolist(),
|
|
"politics:man:neutral": df[df["politics_man"] == "N"]["source"]
|
|
.unique()
|
|
.tolist(),
|
|
"politics:tc:neutral": df[df["politics_tc_cat"] == "N"]["source"]
|
|
.unique()
|
|
.tolist(),
|
|
"politics:agg:neutral": df[df["politics_agg"] == "N"]["source"]
|
|
.unique()
|
|
.tolist(),
|
|
"type:agency": df[df["type"] == "A"]["source"].unique().tolist(),
|
|
"type:outlet": df[df["type"] == "OUTLET"]["source"].unique().tolist(),
|
|
"area:national": df[df["area"] == "nat"]["source"].unique().tolist(),
|
|
"area:regional": df[df["area"] == "loc"]["source"].unique().tolist(),
|
|
}
|
|
|
|
def read_migration_provider_attrs():
|
|
df = pd.read_csv("data/migration/provider_pol_rel_ratings.csv")
|
|
|
|
return {
|
|
"politics:sc:left": df[df["political_stance"] == -1]["provider"].unique().tolist(),
|
|
"politics:sc:right": df[df["political_stance"] == 1]["provider"].unique().tolist(),
|
|
"politics:sc:neutral": df[df["political_stance"] == 0]["provider"].unique().tolist(),
|
|
"religion:sc:catholic": df[df["religious"] == True]["provider"].unique().tolist(),
|
|
"religion:sc:non_catholic": df[df["religious"] == False]["provider"].unique().tolist()
|
|
}
|
|
|
|
|
|
def read_crashes_provider_attrs():
|
|
df = pd.read_csv("resources/crashes_sources.csv")
|
|
|
|
|
|
df = df.dropna(subset=["ProviderNameCorr"])
|
|
|
|
|
|
name_map = {
|
|
row["ProviderName"]: row["ProviderNameCorr"].strip('"')
|
|
for _, row in df.iterrows()
|
|
}
|
|
|
|
|
|
df = df.groupby(list(df.columns[2:11]))["ProviderFreq"].apply(sum).reset_index()
|
|
|
|
|
|
df = df.assign(**{"Province": df["Province"].str.split("|")}).explode("Province")
|
|
|
|
attr_map = {
|
|
f"{col}:{val}": df[df[col] == val]["ProviderNameCorr"].unique().tolist()
|
|
for col in df.columns[1:9]
|
|
for val in set(df[col].values)
|
|
if val != "-"
|
|
}
|
|
|
|
return attr_map, name_map
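
# Illustrative shapes of the two return values (hypothetical example entries):
#   attr_map: {"Province:Utrecht": ["AD", ...], "MediumType:dagblad": [...], ...}
#             i.e. "<column>:<value>" -> providers having that attribute value
#   name_map: {"ad.nl": "AD", ...}
#             i.e. raw ProviderName -> cleaned ProviderNameCorr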
|
|
|
|
PROVIDER_ATTRS = {
|
|
"femicides/rai": read_rai_provider_attrs(),
|
|
"femicides/olv": {},
|
|
"crashes/thecrashes": read_crashes_provider_attrs()[0],
|
|
"migration/pavia": read_migration_provider_attrs()
|
|
}
|
|
|
|
|
|
|
|
def get_dataset_variables(dataset_name):
    spacy_models = {
        "femicides/rai": "it_core_news_md",
        "femicides/olv": "it_core_news_md",
        "crashes/thecrashes": "nl_core_news_md",
        "migration/pavia": "it_core_news_md",
    }
    if dataset_name not in spacy_models:
        raise ValueError("Unsupported dataset!")

    return {
        "dataset": dataset_name,
        "frames": read_frames_of_interest(dataset_name),
        "spacy_model": spacy_models[dataset_name],
    }
|
|
|
|
|
|
|
|
|
|
def load_event_data(dataset):
|
|
if dataset == "femicides/rai":
|
|
event_data_file = "output/femicides/split_data/rai/split_ALL.events.csv"
|
|
texts_data_file = "output/femicides/split_data/rai/split_ALL.texts.meta.csv"
|
|
elif dataset == "femicides/olv":
|
|
event_data_file = "output/femicides/split_data/olv/split_dev10.events.csv"
|
|
texts_data_file = "output/femicides/split_data/olv/split_dev10.texts.csv"
|
|
elif dataset == "crashes/thecrashes":
|
|
event_data_file = "output/crashes/split_data/split_dev10.events.csv"
|
|
texts_data_file = "output/crashes/split_data/split_dev10.texts.meta.csv"
|
|
elif dataset == "migration/pavia":
|
|
event_data_file = "output/migration/split_data/split_dev10.events.csv"
|
|
texts_data_file = "output/migration/split_data/split_dev10.texts.meta.csv"
|
|
else:
|
|
raise ValueError("Unsupported dataset")
|
|
events = pd.read_csv(event_data_file, dtype={"event:id": int}, index_col=0)
|
|
texts = enrich_texts_df(pd.read_csv(texts_data_file, index_col=0), events)
|
|
return {"events_df": events, "texts_df": texts}
|
|
|
|
|
|
DATASETS = {
|
|
"femicides/rai": load_event_data("femicides/rai"),
|
|
"femicides/olv": load_event_data("femicides/olv"),
|
|
"crashes/thecrashes": load_event_data("crashes/thecrashes"),
|
|
"migration/pavia": load_event_data("migration/pavia"),
|
|
}
|
|
|
|
SKIP_FUNCTIONS = {
|
|
"femicides/rai": None,
|
|
"femicides/olv": None,
|
|
"crashes/thecrashes": lambda doc: not is_a_dutch_text(doc),
|
|
"migration/pavia": None
|
|
}
|
|
|
|
|
|
def read_frames_to_event_roles(dataset):
|
|
if dataset == "femicides/rai":
|
|
ftr_df = pd.read_csv("resources/femicides_frame_to_roles.csv")
|
|
if dataset == "femicides/olv":
|
|
ftr_df = pd.read_csv("resources/femicides_frame_to_roles.csv")
|
|
elif dataset == "crashes/thecrashes":
|
|
ftr_df = pd.read_csv("resources/crashes_frame_to_roles.csv")
|
|
else:
|
|
raise ValueError("Unsupported dataset!")
|
|
|
|
frames_to_event_roles: Dict[str, Dict[str, List[str]]] = {}
|
|
role_types = [col for col in ftr_df.columns if col.startswith("role:")]
|
|
for _, row in ftr_df.iterrows():
|
|
frame_roles = defaultdict(list)
|
|
for rt in role_types:
|
|
role_key = rt.split(":")[1]
|
|
if row[rt] == "-":
|
|
frame_roles[role_key] = []
|
|
else:
|
|
for role in row[rt].split("|"):
|
|
frame_roles[role_key].append(role)
|
|
frames_to_event_roles[row["frame"]] = frame_roles
|
|
return frames_to_event_roles
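
# Illustrative output (hypothetical frame/role names):
#   {"Killing": {"victim": ["Victim"], "attacker": ["Killer"]},
#    "Death":   {"victim": ["Protagonist"], "attacker": []}}
# i.e. for each dataset-specific event role (the "role:..." CSV columns), the
# FrameNet frame elements that express it in a given frame; "-" cells map to [].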
|
|
|
|
|
|
def get_role_expressions(
|
|
struct: FrameStructure,
|
|
roles_dep_map: Dict[int, Dict[str, str]],
|
|
frame_to_role_map: Optional[Dict[str, Dict[str, List[str]]]],
|
|
depth_filter: int,
|
|
output_depth_only: bool = False,
|
|
) -> List[str]:
|
|
|
|
role_exps = []
|
|
role_deps = roles_dep_map[struct.target.tokens_idx[0]]
|
|
|
|
def make_exp(_role, _dep, _depth):
|
|
if output_depth_only:
|
|
return _role + "::" + str(_depth)
|
|
else:
|
|
if _depth > depth_filter:
|
|
_dep = None
|
|
return _role + "::" + (_dep or "_UNK_DEP")
|
|
|
|
|
|
if frame_to_role_map is None:
|
|
for role, _ in struct.roles:
|
|
dep, depth = role_deps.get(role, (None, -1))
|
|
role_exps.append(make_exp(role, dep, depth))
|
|
elif struct.frame in frame_to_role_map:
|
|
for role_type, rt_roles in frame_to_role_map[struct.frame].items():
|
|
for role in rt_roles:
|
|
if role in [r[0] for r in struct.roles]:
|
|
dep, depth = role_deps.get(role, (None, -1))
|
|
role_exps.append(make_exp(role, dep, depth))
|
|
|
|
|
|
|
|
return role_exps
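
# Illustrative return values (hypothetical role and dependency labels):
#   output_depth_only=False -> ["Victim::nsubj", "Killer::_UNK_DEP"]
#   output_depth_only=True  -> ["Victim::1", "Killer::-1"]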
|
|
|
|
|
|
def get_analyze_frame_samples(
|
|
construction,
|
|
frame,
|
|
dependency,
|
|
role,
|
|
lome_model,
|
|
max_samples_per_doc,
|
|
samples_to_find,
|
|
selected_documents,
|
|
dataset_vars,
|
|
texts_df,
|
|
):
|
|
dataset = dataset_vars["dataset"]
|
|
|
|
print("# selected documents", len(selected_documents))
|
|
|
|
samples_found = {}
|
|
|
|
tar_blocks = get_tarball_blocks(dataset, lome_model)
|
|
|
|
syntax_cache = SYNTAX_ANALYSIS_CACHE_FILES[dataset]
|
|
|
|
prediction_files = []
|
|
print("# tar blocks", len(glob.glob(tar_blocks + "/*.tar")))
|
|
for block in glob.glob(tar_blocks + "/*.tar"):
|
|
with tarfile.open(block, "r") as tar_f:
|
|
block_prediction_files = [
|
|
m.name for m in tar_f.getmembers() if m.name.endswith(".comm.json")
|
|
]
|
|
print("\t# prediction files", len(prediction_files))
|
|
matching_prediction_files = [
|
|
pf
|
|
for pf in block_prediction_files
|
|
if re.search(r"/(\d+)/lome_(\d+).comm.json", pf).group(2)
|
|
in selected_documents
|
|
]
|
|
print("\t# matching prediction files", len(matching_prediction_files))
|
|
print("\t")
|
|
prediction_files.extend(matching_prediction_files)
|
|
print(len(prediction_files))
|
|
|
|
while prediction_files and len(samples_found) < samples_to_find:
|
|
|
|
print(
|
|
f"\t\tsamples_found: {len(samples_found)}//prediction_files left: {len(prediction_files)}"
|
|
)
|
|
|
|
|
|
pf = random.choice(prediction_files)
|
|
prediction_files.remove(pf)
|
|
|
|
print(pf)
|
|
|
|
|
|
doc_id = os.path.basename(pf).split(".")[0].split("_")[1]
|
|
doc_key = doc_id[:2]
|
|
tarball = get_tarball_blocks(dataset, lome_model) + f"/block_{doc_key}.tar"
|
|
with tarfile.open(tarball, "r") as tar_f:
|
|
pf_obj = io.TextIOWrapper(tar_f.extractfile(pf))
|
|
(
|
|
sents,
|
|
pred_structures,
|
|
syntax_analyses,
|
|
role_analyses,
|
|
) = process_prediction_file(
|
|
filename=pf,
|
|
file_obj=pf_obj,
|
|
dataset_name=dataset_vars["dataset"],
|
|
syntax_cache=syntax_cache,
|
|
deep_frames_cache=deep_frames_cache,
|
|
)
|
|
|
|
if syntax_analyses is None:
|
|
continue
|
|
|
|
(
|
|
frame_sents,
|
|
frame_pred_structures,
|
|
frame_syntax_analyses,
|
|
frame_role_mappings,
|
|
) = ([], [], [], [])
|
|
for s, pred, syn, rol in zip(
|
|
sents, pred_structures, syntax_analyses, role_analyses
|
|
):
|
|
for fs in pred.values():
|
|
fs_syn = get_syntax_info(fs, syn)
|
|
fs_rol = rol[fs.target.tokens_idx[0]]
|
|
|
|
frame_matches = frame == "*" or fs.frame == frame
|
|
construction_matches = (
|
|
construction == "*"
|
|
or fs_syn["syn_construction"] == construction
|
|
)
|
|
role_matches = role == "*" or role in [r for r, _ in fs.roles]
|
|
if role != "*":
|
|
dependency_matches = dependency == "*" or (
|
|
role,
|
|
dependency,
|
|
) in [(r, d) for r, (d, _) in fs_rol.items()]
|
|
else:
|
|
dependency_matches = dependency == "*" or dependency in [
|
|
d for d, _ in fs_rol.values()
|
|
]
|
|
if (
|
|
frame_matches
|
|
and construction_matches
|
|
and role_matches
|
|
and dependency_matches
|
|
):
|
|
frame_sents.append(s)
|
|
frame_pred_structures.append(pred)
|
|
frame_syntax_analyses.append(syn)
|
|
frame_role_mappings.append(rol)
|
|
|
|
|
|
if not frame_sents:
|
|
continue
|
|
for _ in range(max_samples_per_doc):
|
|
selected_idx = random.randrange(len(frame_sents))
|
|
            if (pf, selected_idx) not in samples_found:
|
|
sample = (
|
|
frame_sents[selected_idx],
|
|
frame_pred_structures[selected_idx],
|
|
frame_syntax_analyses[selected_idx],
|
|
frame_role_mappings[selected_idx],
|
|
)
|
|
if sample not in samples_found.values():
|
|
samples_found[(pf, selected_idx)] = sample
|
|
|
|
|
|
output = []
|
|
for (pf, idx), (sent, structs, syntax, roles) in samples_found.items():
|
|
|
|
re_m = re.search(r"/(\d+)/lome_(\d+).comm.json", pf)
|
|
event_id = re_m.group(1)
|
|
doc_id = re_m.group(2)
|
|
|
|
output.append(
|
|
{
|
|
"sentence": sent,
|
|
"fn_structures": [
|
|
dataclasses.asdict(fs) for fs in structs.values()
|
|
],
|
|
"syntax": syntax,
|
|
"roles": roles,
|
|
"meta": {
|
|
"event_id": event_id,
|
|
"doc_id": doc_id,
|
|
"text_meta": get_text_meta(doc_id, texts_df),
|
|
},
|
|
}
|
|
)
|
|
return output
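
# Each collected sample is serialized as a dict with the keys "sentence",
# "fn_structures", "syntax", "roles" and "meta" (event id, document id and the
# document's metadata row).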
|
|
|
|
|
|
|
|
def security_check():
|
|
return True
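
# Currently a stub that always grants access. A minimal sketch of a real check,
# assuming the "auth_key" cookie set by /check_password below, could be:
#
#     def security_check():
#         return request.cookies.get("auth_key") == AUTH_KEY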
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.before_request
|
|
def init_session():
|
|
if session.get("initialized"):
|
|
return
|
|
|
|
print("Initializing...")
|
|
_switch_dataset("femicides/olv", True)
|
|
session["initialized"] = True
|
|
return
|
|
|
|
|
|
@app.route("/")
|
|
def index():
|
|
return redirect(url_for("demo"))
|
|
|
|
|
|
@app.route("/explore")
|
|
def start():
|
|
return render_template("index.html")
|
|
|
|
|
|
@app.route("/demo")
|
|
def demo():
|
|
return render_template("demo.html")
|
|
|
|
|
|
@app.route("/check_password", methods=["POST"])
|
|
def check_password():
|
|
entered_password = request.form["password"]
|
|
if entered_password == PASSWORD:
|
|
resp = jsonify({"success": True})
|
|
resp.set_cookie("auth_key", AUTH_KEY)
|
|
return resp
|
|
else:
|
|
return jsonify({"success": False})
|
|
|
|
|
|
@app.route("/switch_dataset")
|
|
def switch_dataset():
|
|
new_dataset = request.args.get("dataset")
|
|
_switch_dataset(new_dataset)
|
|
return jsonify({"result": "changed_dataset"})
|
|
|
|
|
|
def _switch_dataset(new_dataset, first_time=False):
    # Nothing to do if the requested dataset is already active.
    if not first_time and new_dataset == session["dataset_vars"]["dataset"]:
        return

    session["dataset_vars"] = get_dataset_variables(new_dataset)
    if new_dataset == "crashes/thecrashes":
        # Only the crashes dataset needs the raw -> corrected provider name mapping.
        _, name_map = read_crashes_provider_attrs()
        session["provider_name_map"] = name_map
    else:
        session["provider_name_map"] = {}
|
|
|
|
|
|
|
|
@app.route("/analyze")
|
|
def analyze():
|
|
|
|
event_id = request.args.get("event")
|
|
doc_id = request.args.get("document")
|
|
lome_model = request.args.get("model")
|
|
dataset = session["dataset_vars"]["dataset"]
|
|
|
|
if dataset in PROTECTED_DATASETS:
|
|
if not security_check():
|
|
abort(403)
|
|
|
|
print(dataset)
|
|
output = analyze_single_document(
|
|
doc_id,
|
|
event_id,
|
|
lome_model,
|
|
dataset,
|
|
DATASETS[session["dataset_vars"]["dataset"]]["texts_df"],
|
|
deep_frames_cache=deep_frames_cache
|
|
)
|
|
return jsonify(output)
|
|
|
|
|
|
@app.route("/sample_frame")
|
|
def sample_frame():
|
|
|
|
dataset = session["dataset_vars"]["dataset"]
|
|
if dataset in PROTECTED_DATASETS and not security_check():
|
|
abort(403)
|
|
|
|
frame = request.args.get("frame")
|
|
construction = request.args.get("construction")
|
|
role = request.args.get("role")
|
|
dependency = request.args.get("dependency")
|
|
|
|
texts_df = DATASETS[session["dataset_vars"]["dataset"]]["texts_df"]
|
|
events_df = DATASETS[session["dataset_vars"]["dataset"]]["events_df"]
|
|
filtered_docs = filter_documents(
|
|
session["dataset_vars"],
|
|
events_df,
|
|
texts_df,
|
|
PROVIDER_ATTRS[session["dataset_vars"]["dataset"]],
|
|
session["provider_name_map"],
|
|
)
|
|
selected_documents = {doc["doc_id"] for doc in filtered_docs}
|
|
lome_model = request.args.get("model")
|
|
samples_to_find = int(request.args.get("nsamples", 5))
|
|
max_samples_per_doc = int(request.args.get("nperdoc", 10))
|
|
|
|
output = get_analyze_frame_samples(
|
|
construction,
|
|
frame,
|
|
dependency,
|
|
role,
|
|
lome_model,
|
|
max_samples_per_doc,
|
|
samples_to_find,
|
|
selected_documents,
|
|
session["dataset_vars"],
|
|
DATASETS[session["dataset_vars"]["dataset"]]["texts_df"],
|
|
)
|
|
return jsonify(output)
|
|
|
|
|
|
@app.route("/lus_to_frames")
|
|
def get_frames_from_lus():
|
|
lus = request.args.get("lus", "").split("+")
|
|
frames = set()
|
|
for lu in lus:
|
|
frames.update({lu_info.frame.name for lu_info in fn.lus(r"^" + lu + r"\.")})
|
|
print(frames)
|
|
return jsonify({"frames": sorted(frames)})
|
|
|
|
|
|
def format_frame_description(frame_def_xml):
|
|
frame_def_fmt = [frame_def_xml.text] if frame_def_xml.text else []
|
|
for elem in frame_def_xml:
|
|
if elem.tag == "ex":
|
|
break
|
|
elif elem.tag == "fen":
|
|
frame_def_fmt.append("<b>" + elem.text + "</b>")
|
|
elif elem.text:
|
|
frame_def_fmt.append(elem.text)
|
|
if elem.tail:
|
|
frame_def_fmt.append(elem.tail)
|
|
return frame_def_fmt
|
|
|
|
|
|
def get_alt_perspectives(frame_info, frame_name):
|
|
alt_perspectives = []
|
|
result_frames = [fr.subFrameName for fr in frame_info.frameRelations if fr.type.name == "Causative_of" and fr.superFrameName == frame_name]
|
|
if result_frames:
|
|
alt_perspectives.append({"frame": result_frames[0], "type": "result"})
|
|
|
|
cause_frames = [fr.superFrameName for fr in frame_info.frameRelations if fr.type.name == "Causative_of" and fr.subFrameName == frame_name]
|
|
if cause_frames:
|
|
alt_perspectives.append({"frame": cause_frames[0], "type": "causer"})
|
|
|
|
|
|
|
|
|
|
|
|
neutral_frames = [fr.superFrameName for fr in frame_info.frameRelations if fr.type.name == "Perspective_on" and fr.subFrameName == frame_name]
|
|
if neutral_frames:
|
|
flipped_frames = [fr.subFrameName for fr in fn.frame(neutral_frames[0]).frameRelations if fr.type.name == "Perspective_on" and fr.superFrameName == neutral_frames[0] and fr.subFrameName != frame_name]
|
|
if flipped_frames:
|
|
alt_perspectives.extend([{"frame": ff, "type": "flipped"} for ff in flipped_frames])
|
|
|
|
return alt_perspectives
|
|
|
|
|
|
@app.route("/frame_info")
|
|
def get_frame_info():
|
|
frame_name = request.args.get("frame").strip()
|
|
try:
|
|
print(repr(frame_name))
|
|
print(type(frame_name))
|
|
try:
|
|
frame_info = fn.frame(frame_name)
|
|
        except KeyError:
            # The first lookup occasionally fails while NLTK is still lazy-loading
            # the FrameNet corpus, so retry the same call once.
            frame_info = fn.frame(frame_name)
|
|
        try:
            # Keep only the first sentence of the definition markup and re-close
            # the root element so the snippet remains well-formed XML.
            definition_first_sent = nltk.sent_tokenize(frame_info.definitionMarkup)[0] + "</def-root>"
            frame_def_xml = ET.fromstring(definition_first_sent)
        except ET.XMLSyntaxError:
            # Fall back to parsing the full definition markup if truncation broke the XML.
            frame_def_xml = ET.fromstring(frame_info.definitionMarkup)
|
|
|
|
frame_def_fmt = format_frame_description(frame_def_xml)
|
|
|
|
exemplars = [
|
|
{
|
|
"text": exemplar.text,
|
|
"target_lu": lu_name,
|
|
"target_idx": list(exemplar["Target"][0]),
|
|
"core_fes": {
|
|
role: exemplar.text[start_idx:end_idx]
|
|
for role, start_idx, end_idx in exemplar.FE[0]
|
|
if role in [fe for fe, fe_info in frame_info.FE.items() if fe_info.coreType == "Core"]
|
|
}
|
|
}
|
|
for lu_name, lu_info in frame_info["lexUnit"].items()
|
|
for exemplar in lu_info.exemplars
|
|
]
|
|
|
|
|
|
exa_typicality_scores = [(exa, len(exa["text"]) - 25 * len(exa["core_fes"])) for exa in exemplars]
|
|
if exa_typicality_scores:
|
|
typical_exemplar = min(exa_typicality_scores, key=lambda t: t[1])[0]
|
|
else:
|
|
typical_exemplar = None
|
|
|
|
alt_perspectives = get_alt_perspectives(frame_info, frame_name)
|
|
|
|
return jsonify({
|
|
"result": "OK",
|
|
"frameDefinition": frame_def_fmt,
|
|
"exemplars": exemplars,
|
|
"altPerspectives": alt_perspectives,
|
|
"typicalExemplar": {
|
|
"text": typical_exemplar["text"],
|
|
"coreFrameElements": typical_exemplar["core_fes"]
|
|
} if typical_exemplar else None
|
|
})
|
|
|
|
except FramenetError:
|
|
return jsonify({"result": "FAIL", "info": "could not find frame"})
|
|
|
|
|
|
|
|
@app.route("/frames")
|
|
def get_frames():
|
|
return jsonify(session["dataset_vars"]["frames"])
|
|
|
|
|
|
@app.route("/constructions")
|
|
def get_constructions():
|
|
return jsonify(POSSIBLE_CONSTRUCTIONS)
|
|
|
|
|
|
@app.route("/event_filters")
|
|
def get_event_filters():
|
|
dataset = session["dataset_vars"]["dataset"]
|
|
if dataset in PROTECTED_DATASETS and not security_check():
|
|
abort(403)
|
|
|
|
if session["dataset_vars"]["dataset"] == "femicides/rai":
|
|
events_df = DATASETS[session["dataset_vars"]["dataset"]]["events_df"]
|
|
event_categories = events_df["event:category"].unique().tolist()
|
|
regions = sorted(events_df["event:region"].unique().tolist())
|
|
sem_location = sorted(events_df["event:semantic_location"].unique().tolist())
|
|
victim_age = VICTIM_AGE_GROUPS
|
|
victim_nationality = [
|
|
"Italia",
|
|
ALL_FOREIGN_NATIONALITIES,
|
|
"non rilevato",
|
|
] + sorted(
|
|
[
|
|
i
|
|
for i in events_df["victim:nationality"].dropna().unique().tolist()
|
|
if i not in ["Italia", "non rilevato"]
|
|
]
|
|
)
|
|
attacker_nationality = [
|
|
"Italia",
|
|
ALL_FOREIGN_NATIONALITIES,
|
|
"non rilevato",
|
|
] + sorted(
|
|
[
|
|
i
|
|
for i in events_df["attacker:nationality"].dropna().unique().tolist()
|
|
if i not in ["Italia", "non rilevato"]
|
|
]
|
|
)
|
|
|
|
victim_occupation = sorted(
|
|
[
|
|
i
|
|
for i in events_df["victim:occupation"].dropna().unique().tolist()
|
|
]
|
|
)
|
|
|
|
attacker_occupation = sorted(
|
|
[
|
|
i
|
|
for i in events_df["attacker:occupation"].dropna().unique().tolist()
|
|
]
|
|
)
|
|
|
|
|
|
return jsonify(
|
|
{
|
|
"event_categories": event_categories,
|
|
"regions": regions,
|
|
"sem_location": sem_location,
|
|
"victim_age": victim_age,
|
|
"victim_nationality": victim_nationality,
|
|
"attacker_nationality": attacker_nationality,
|
|
"victim_occupation": victim_occupation,
|
|
"attacker_occupation": attacker_occupation
|
|
}
|
|
)
|
|
|
|
elif session["dataset_vars"]["dataset"] == "femicides/olv":
|
|
events_df = DATASETS[session["dataset_vars"]["dataset"]]["events_df"]
|
|
regions = sorted([str(r) for r in events_df["event:region"].unique().tolist()])
|
|
victim_age = VICTIM_AGE_GROUPS
|
|
|
|
return jsonify(
|
|
{
|
|
"regions": regions,
|
|
"victim_age": victim_age,
|
|
}
|
|
)
|
|
|
|
|
|
elif session["dataset_vars"]["dataset"] == "crashes/thecrashes":
|
|
events_df = DATASETS[session["dataset_vars"]["dataset"]]["events_df"]
|
|
filters = {
|
|
"outcomes": ["no one", "one or more people"],
|
|
"imbalanced": ["yes", "no"],
|
|
}
|
|
return jsonify(filters)
|
|
|
|
else:
|
|
return jsonify({})
|
|
|
|
|
|
@app.route("/dep_labels")
|
|
def get_dep_labels():
|
|
dep_labels = set()
|
|
with open("resources/dep_labels.txt", encoding="utf-8") as f:
|
|
for line in f:
|
|
dep_labels.add(line.strip())
|
|
return jsonify(sorted(dep_labels))
|
|
|
|
|
|
@app.route("/role_labels")
|
|
def get_role_labels():
|
|
frame = request.args.get("frame")
|
|
roles = fn_frames_to_roles.get(frame)
|
|
if roles is not None:
|
|
return jsonify(roles)
|
|
else:
|
|
return jsonify([])
|
|
|
|
|
|
@app.route("/doc_filters")
|
|
def get_doc_filters():
|
|
|
|
texts_df = DATASETS[session["dataset_vars"]["dataset"]]["texts_df"]
|
|
|
|
provider_attrs = PROVIDER_ATTRS[session["dataset_vars"]["dataset"]]
|
|
|
|
if session["dataset_vars"]["dataset"] == "crashes/thecrashes":
|
|
|
|
name_map = session["provider_name_map"]
|
|
providers = sorted(
|
|
texts_df["provider"]
|
|
.apply(lambda prov: name_map.get(prov))
|
|
.dropna()
|
|
.unique()
|
|
.tolist()
|
|
)
|
|
|
|
provider_provinces = sorted(
|
|
key.split(":")[1]
|
|
for key in provider_attrs
|
|
if key.startswith("Province:")
|
|
)
|
|
provider_content_types = sorted(
|
|
key.split(":")[1]
|
|
for key in provider_attrs
|
|
if key.startswith("ContentType:")
|
|
)
|
|
|
|
provider_medium_types = sorted(
|
|
key.split(":")[1]
|
|
for key in provider_attrs
|
|
if key.startswith("MediumType:")
|
|
)
|
|
|
|
provider_owners = sorted(
|
|
key.split(":")[1]
|
|
for key in provider_attrs
|
|
if key.startswith("MediaOwner:")
|
|
)
|
|
|
|
else:
|
|
providers = sorted(texts_df["provider"].dropna().unique().tolist())
|
|
provider_provinces = []
|
|
provider_content_types = []
|
|
provider_medium_types = []
|
|
provider_owners = []
|
|
|
|
return jsonify(
|
|
{
|
|
"providers": providers,
|
|
"provider_provinces": provider_provinces,
|
|
"provider_content_types": provider_content_types,
|
|
"provider_medium_types": provider_medium_types,
|
|
"provider_owners": provider_owners,
|
|
}
|
|
)
|
|
|
|
|
|
def apply_doc_filters(
|
|
doc_filters: List[str],
|
|
provider_attrs: dict,
|
|
prov_name_map: dict,
|
|
texts_df: pd.DataFrame,
|
|
):
|
|
if not doc_filters:
|
|
all_doc_ids = set(int(eid) for eid in texts_df["text_id"].tolist())
|
|
return all_doc_ids
|
|
|
|
|
|
filters_attr_values = defaultdict(list)
|
|
for doc_filter in doc_filters:
|
|
_, attribute, value = doc_filter.split("::")
|
|
filters_attr_values[attribute].append(value)
|
|
|
|
selected_docs = texts_df
|
|
for attribute, values in filters_attr_values.items():
|
|
attr_conditions = []
|
|
|
|
for value in values:
|
|
if attribute == "days_after":
|
|
|
|
|
|
selected_docs = selected_docs.dropna(subset=["days_after_event"])
|
|
|
|
if value == "day":
|
|
condition = selected_docs["days_after_event"] < 1
|
|
elif value == "week":
|
|
condition = selected_docs["days_after_event"].isin(range(1, 7))
|
|
elif value == "month":
|
|
condition = selected_docs["days_after_event"].isin(range(7, 30))
|
|
elif value == "year":
|
|
condition = selected_docs["days_after_event"].isin(range(30, 365))
|
|
else:
|
|
condition = selected_docs["days_after_event"] > 365
|
|
|
|
elif session["dataset_vars"]["dataset"] == "femicides/rai":
|
|
if any(attribute.startswith(key) for key in ["politics:", "type", "area"]):
|
|
providers = provider_attrs[attribute + ":" + value]
|
|
condition = selected_docs["provider"].isin(providers)
|
|
else:
|
|
condition = selected_docs[attribute] == value
|
|
|
|
elif session["dataset_vars"]["dataset"] == "femicides/olv":
|
|
condition = selected_docs[attribute] == value
|
|
|
|
elif session["dataset_vars"]["dataset"] == "crashes/thecrashes":
|
|
|
|
if attribute == "provider":
|
|
condition = selected_docs["provider"].apply(lambda prov: prov_name_map.get(prov)) == value
|
|
|
|
elif attribute in [
|
|
"area",
|
|
"country",
|
|
"province",
|
|
"content_type",
|
|
"medium_type",
|
|
"owner",
|
|
]:
|
|
|
|
attribute_altname = {
|
|
"area": "RegionalScope",
|
|
"country": "Country",
|
|
"province": "Province",
|
|
"content_type": "ContentType",
|
|
"medium_type": "MediumType",
|
|
"owner": "MediaOwner",
|
|
}[attribute]
|
|
|
|
providers = provider_attrs[attribute_altname + ":" + value]
|
|
condition = selected_docs["provider"].apply(lambda prov: prov_name_map.get(prov)).isin(providers)
|
|
else:
|
|
condition = selected_docs[attribute] == value
|
|
|
|
elif session["dataset_vars"]["dataset"] == "migration/pavia":
|
|
if attribute.startswith("politics") or attribute.startswith("religion"):
|
|
providers = provider_attrs[attribute + ":" + value]
|
|
condition = selected_docs["provider"].isin(providers)
|
|
else:
|
|
condition = selected_docs[attribute] == value
|
|
|
|
attr_conditions.append(condition)
|
|
|
|
selected_docs = selected_docs[functools.reduce(np.logical_or, attr_conditions)]
|
|
|
|
return set(int(eid) for eid in selected_docs["text_id"].tolist())
|
|
|
|
|
|
def apply_event_filters(ev_filters: List[str], events_df: pd.DataFrame):
|
|
if not ev_filters:
|
|
all_event_ids = set(int(eid) for eid in events_df["event:id"].tolist())
|
|
return all_event_ids
|
|
|
|
selected_events = events_df
|
|
for ev_filter in ev_filters:
|
|
print(ev_filter)
|
|
_, attribute, value = ev_filter.split("::")
|
|
print(attribute)
|
|
|
|
if session["dataset_vars"]["dataset"] in ["femicides/rai", "femicides/olv"]:
|
|
if attribute in ["victim:age"]:
|
|
print(value)
|
|
if "-" not in value:
|
|
|
|
age_from = int(value)
|
|
age_to = 200
|
|
else:
|
|
age_from = int(value.split("-")[0])
|
|
age_to = int(value.split("-")[1])
|
|
events_with_age = selected_events[
|
|
selected_events[attribute] != "non rilevato"
|
|
]
|
|
selected_events = events_with_age[
|
|
events_with_age[attribute].astype(int).isin(range(age_from, age_to))
|
|
]
|
|
elif attribute in ["victim:nationality", "attacker:nationality"]:
|
|
if value == ALL_FOREIGN_NATIONALITIES:
|
|
selected_events = selected_events.dropna(subset=[attribute])
|
|
selected_events = selected_events[
|
|
~selected_events[attribute].isin(["Italia", "non rilevato", "nessuno", "sconosciuto"])
|
|
]
|
|
else:
|
|
selected_events = selected_events[
|
|
selected_events[attribute] == value
|
|
]
|
|
else:
|
|
selected_events = selected_events[selected_events[attribute] == value]
|
|
|
|
elif session["dataset_vars"]["dataset"] == "crashes/thecrashes":
|
|
|
|
if attribute.startswith("imbalanced"):
|
|
|
|
selected_events = selected_events[
|
|
(selected_events["outcomes:dead:cyclist"] > 0)
|
|
| (selected_events["outcomes:dead:pedestrian"] > 0)
|
|
| (selected_events["outcomes:injured:cyclist"] > 0)
|
|
| (selected_events["outcomes:injured:pedestrian"] > 0)
|
|
]
|
|
|
|
|
|
selected_events = selected_events[
|
|
(selected_events["outcomes:injured:vehicle"] == 0)
|
|
& (selected_events["outcomes:dead:vehicle"] == 0)
|
|
]
|
|
|
|
|
|
selected_events = selected_events[
|
|
(selected_events["vehicle_involved"] == 1)
|
|
]
|
|
|
|
if attribute.startswith("outcomes:"):
|
|
|
|
outcome = attribute.split(":")[1]
|
|
person = attribute.split(":")[2]
|
|
|
|
if outcome == "deadinjured":
|
|
if person == "cyclistpedestrian":
|
|
if value == "no one":
|
|
selected_events = selected_events[
|
|
(selected_events["outcomes:dead:cyclist"] == 0)
|
|
& (selected_events["outcomes:dead:pedestrian"] == 0)
|
|
& (selected_events["outcomes:injured:cyclist"] == 0)
|
|
& (selected_events["outcomes:injured:pedestrian"] == 0)
|
|
]
|
|
else:
|
|
selected_events = selected_events[
|
|
(selected_events["outcomes:dead:cyclist"] > 0)
|
|
| (selected_events["outcomes:dead:pedestrian"] > 0)
|
|
| (selected_events["outcomes:injured:cyclist"] > 0)
|
|
| (selected_events["outcomes:injured:pedestrian"] > 0)
|
|
]
|
|
else:
|
|
if value == "no one":
|
|
selected_events = selected_events[
|
|
(selected_events[f"outcomes:dead:{person}"] == 0)
|
|
& (selected_events[f"outcomes:injured:{person}"] == 0)
|
|
]
|
|
else:
|
|
selected_events = selected_events[
|
|
(selected_events[f"outcomes:dead:{person}"] > 0)
|
|
| (selected_events[f"outcomes:injured:{person}"] > 0)
|
|
]
|
|
|
|
else:
|
|
if person == "cyclistpedestrian":
|
|
if value == "no one":
|
|
selected_events = selected_events[
|
|
(selected_events[f"outcomes:{outcome}:cyclist"] == 0)
|
|
& (
|
|
selected_events[f"outcomes:{outcome}:pedestrian"]
|
|
== 0
|
|
)
|
|
]
|
|
else:
|
|
selected_events = selected_events[
|
|
(selected_events[f"outcomes:{outcome}:cyclist"] == 0)
|
|
| (
|
|
selected_events[f"outcomes:{outcome}:pedestrian"]
|
|
> 0
|
|
)
|
|
]
|
|
else:
|
|
if value == "no one":
|
|
selected_events = selected_events[
|
|
selected_events[attribute] == 0
|
|
]
|
|
else:
|
|
selected_events = selected_events[
|
|
selected_events[attribute] > 0
|
|
]
|
|
|
|
return set(int(eid) for eid in selected_events["event:id"].tolist())
|
|
|
|
|
|
@app.route("/documents")
|
|
def documents():
|
|
|
|
events_df = DATASETS[session["dataset_vars"]["dataset"]]["events_df"]
|
|
texts_df = DATASETS[session["dataset_vars"]["dataset"]]["texts_df"]
|
|
document_list = filter_documents(
|
|
session["dataset_vars"],
|
|
events_df,
|
|
texts_df,
|
|
PROVIDER_ATTRS[session["dataset_vars"]["dataset"]],
|
|
session["provider_name_map"],
|
|
)
|
|
return jsonify(sorted(document_list, key=lambda d: int(d["event_id"])))
|
|
|
|
|
|
def filter_documents(dataset_vars, events_df, texts_df, provider_attrs, name_map):
|
|
event_filters = read_filters("event_filters")
|
|
doc_filters = read_filters("doc_filters")
|
|
selected_events = apply_event_filters(event_filters, events_df)
|
|
selected_documents = apply_doc_filters(
|
|
doc_filters, provider_attrs, name_map, texts_df
|
|
)
|
|
|
|
document_list = []
|
|
|
|
blocks = get_tarball_blocks(dataset_vars["dataset"])
|
|
for tarball in glob.glob(blocks + "/*.tar"):
|
|
with tarfile.open(tarball, "r") as tar_f:
|
|
for doc in [f.name for f in tar_f.getmembers() if f.name.endswith(".comm.txt")]:
|
|
|
|
re_m = re.search(r"/(\d+)/lome_(\d+).comm.txt", doc)
|
|
event_id = re_m.group(1)
|
|
doc_id = re_m.group(2)
|
|
if (int(event_id) not in selected_events) or (
|
|
int(doc_id) not in selected_documents
|
|
):
|
|
continue
|
|
document_list.append({"event_id": event_id, "doc_id": doc_id})
|
|
return document_list
|
|
|
|
|
|
def read_filters(arg_name):
|
|
filter_str = request.args.get(arg_name)
|
|
if filter_str:
|
|
filters = filter_str.split("+")
|
|
else:
|
|
filters = []
|
|
return filters
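
# Filter strings arrive as one "+"-separated query parameter whose items each have
# three "::"-separated parts (scope, attribute, value), e.g. (illustrative values):
#   event_filters=ev::victim:age::18-30+ev::event:region::Lazio
#   doc_filters=doc::provider::ANSA+doc::days_after::week
# The first part is ignored by apply_event_filters / apply_doc_filters, which
# unpack each item as `_, attribute, value = item.split("::")`.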
|
|
|
|
|
|
@app.route("/frame_freq")
|
|
def frame_freq():
|
|
|
|
texts_df = DATASETS[session["dataset_vars"]["dataset"]]["texts_df"]
|
|
|
|
|
|
lome_model = request.args.get("model")
|
|
|
|
|
|
event_filters = read_filters("event_filters")
|
|
doc_filters = read_filters("doc_filters")
|
|
events_df = DATASETS[session["dataset_vars"]["dataset"]]["events_df"]
|
|
selected_events = apply_event_filters(event_filters, events_df)
|
|
selected_documents = apply_doc_filters(
|
|
doc_filters,
|
|
PROVIDER_ATTRS[session["dataset_vars"]["dataset"]],
|
|
session["provider_name_map"],
|
|
texts_df,
|
|
)
|
|
|
|
|
|
only_headlines = request.args.get("headlines", "n") == "y"
|
|
|
|
|
|
frame_string = request.args.get("frames").strip()
|
|
frame_filter: List[str] = frame_string.split("+")
|
|
|
|
|
|
constr_string = request.args.get("constructions").strip()
|
|
constr_filter: List[str] = constr_string.split("+") if constr_string else []
|
|
|
|
|
|
group_by_cat: bool = request.args.get("group_by_cat") == "y"
|
|
|
|
|
|
group_by_tgt: bool = request.args.get("group_by_tgt") == "y"
|
|
|
|
|
|
group_by_constr: bool = request.args.get("group_by_constr") == "y"
|
|
|
|
|
|
group_by_root: bool = request.args.get("group_by_root") == "y"
|
|
|
|
|
|
group_by_role_expr: int = int(request.args.get("group_by_role_expr"))
|
|
|
|
|
|
relative: bool = request.args.get("relative") == "y"
|
|
|
|
|
|
plot_over_days_post: bool = request.args.get("plot_over_days_post") == "y"
|
|
|
|
|
|
plot_by_year: bool = request.args.get("plot_by_year") == "y"
|
|
assert not (plot_over_days_post and plot_by_year)
|
|
|
|
|
|
days_time_window: int
|
|
try:
|
|
days_time_window_str = request.args.get("days_time_window")
|
|
if days_time_window_str is None:
|
|
days_time_window = 10
|
|
else:
|
|
days_time_window = int(days_time_window_str)
|
|
except ValueError:
|
|
days_time_window = 10
|
|
|
|
if plot_over_days_post or plot_by_year:
|
|
relevant_frame_counts = defaultdict(lambda: defaultdict(int))
|
|
deep_frame_counts = defaultdict(lambda: defaultdict(int))
|
|
all_frame_counts = defaultdict(lambda: defaultdict(int))
|
|
else:
|
|
relevant_frame_counts = defaultdict(int)
|
|
deep_frame_counts = defaultdict(int)
|
|
all_frame_counts = defaultdict(int)
|
|
|
|
|
|
totals_by_frame = defaultdict(int)
|
|
|
|
print("Processing documents....")
|
|
|
|
blocks = get_tarball_blocks(session["dataset_vars"]["dataset"], lome_model)
|
|
|
|
|
|
tmp_syntax_cache = {}
|
|
|
|
for tarball in sorted(glob.glob(blocks + "/*.tar")):
|
|
with tarfile.open(tarball, "r") as tar_f:
|
|
for mem in sorted(tar_f.getmembers(), key=lambda mem: mem.name):
|
|
if mem is None or not mem.name.endswith(".comm.json"):
|
|
continue
|
|
|
|
|
|
re_m = re.search(r"/(\d+)/lome_(\d+).comm.json", mem.name)
|
|
event_id = re_m.group(1)
|
|
doc_id = re_m.group(2)
|
|
|
|
|
|
if (int(doc_id) not in selected_documents) or (
|
|
int(event_id) not in selected_events
|
|
):
|
|
continue
|
|
if plot_over_days_post:
|
|
texts_df = DATASETS[session["dataset_vars"]["dataset"]]["texts_df"]
|
|
texts_df_dropna = texts_df.dropna(subset=["days_after_event"])
|
|
try:
|
|
df_filter = texts_df_dropna["text_id"] == int(doc_id)
|
|
time_bucket = int(
|
|
texts_df_dropna[df_filter].iloc[0]["days_after_event"]
|
|
)
|
|
except IndexError:
|
|
|
|
continue
|
|
|
|
time_bucket = max(time_bucket, 0)
|
|
|
|
|
|
time_bucket = (time_bucket // days_time_window) * days_time_window
|
|
else:
|
|
time_bucket = 0
|
|
|
|
if plot_by_year:
|
|
texts_df = DATASETS[session["dataset_vars"]["dataset"]]["texts_df"]
|
|
df_filter = texts_df["text_id"] == int(doc_id)
|
|
if "pubyear" in texts_df.columns:
|
|
time_bucket = int(texts_df[df_filter].iloc[0]["pubyear"])
|
|
elif "pubdate" in texts_df.columns:
|
|
pubdate_str = texts_df[df_filter].iloc[0]["pubdate"]
|
|
if pd.isna(pubdate_str):
|
|
continue
|
|
pub_date = datetime.datetime.strptime(pubdate_str, "%Y-%m-%d %H:%M:%S")
|
|
time_bucket = pub_date.year
|
|
else:
|
|
raise ValueError("Cannot plot by year if no `pubyear` or `pubdate` is specified!")
|
|
|
|
|
|
mem_obj = io.TextIOWrapper(tar_f.extractfile(mem))
|
|
|
|
skip_func = SKIP_FUNCTIONS[session["dataset_vars"]["dataset"]]
|
|
if skip_func is not None and skip_func(doc_id):
|
|
print(f"\tskip_func: skipping file {mem}")
|
|
continue
|
|
|
|
|
|
(
|
|
_,
|
|
pred_structures,
|
|
syntax_analyses,
|
|
role_analyses,
|
|
) = process_prediction_file(
|
|
filename=mem.name,
|
|
dataset_name=session["dataset_vars"]["dataset"],
|
|
syntax_cache=SYNTAX_ANALYSIS_CACHE_FILES[session["dataset_vars"]["dataset"]],
|
|
tmp_cache=tmp_syntax_cache,
|
|
file_obj=mem_obj,
|
|
deep_frames_cache=deep_frames_cache
|
|
)
|
|
|
|
for sent_idx, (struct_dict, syntax_dict, roles) in enumerate(zip(
|
|
pred_structures, syntax_analyses, role_analyses
|
|
)):
|
|
if only_headlines and sent_idx > 1:
|
|
continue
|
|
for struct in struct_dict.values():
|
|
|
|
frame_key = struct.frame
|
|
deep_frame_key = struct.deep_frame
|
|
|
|
syntax_info = get_syntax_info(struct, syntax_dict)
|
|
syntax_constr = syntax_info["syn_construction"]
|
|
syntax_cat = syntax_info["syn_category"]
|
|
syntax_at_root = is_at_root(syntax_info)
|
|
|
|
if constr_filter and syntax_constr not in constr_filter:
|
|
continue
|
|
|
|
totals_by_frame[struct.frame] += 1
|
|
|
|
if group_by_cat:
|
|
count_keys = [f"{frame_key}::{syntax_cat}"]
|
|
deep_count_keys = [f"{frame_key}::{syntax_cat}"]
|
|
|
|
elif group_by_tgt:
|
|
tgt_str = (
|
|
" ".join(struct.target.tokens_str)
|
|
.strip("«».,()□�?'\"")
|
|
.strip()
|
|
.lower()
|
|
)
|
|
count_keys = [f"{frame_key}::{tgt_str}"]
|
|
deep_count_keys = [f"{frame_key}::{tgt_str}"]
|
|
|
|
elif group_by_constr and group_by_root:
|
|
count_keys = [
|
|
f"{frame_key}/{syntax_constr}::{'root' if syntax_at_root else 'non-root'}"
|
|
]
|
|
deep_count_keys = [
|
|
f"{deep_frame_key}::{syntax_constr}::{'root' if syntax_at_root else 'non-root'}"
|
|
]
|
|
elif group_by_constr:
|
|
count_keys = [f"{frame_key}::{syntax_constr}"]
|
|
deep_count_keys = [f"{deep_frame_key}::{syntax_constr}"]
|
|
elif group_by_root:
|
|
count_keys = [
|
|
f"{frame_key}::{'root' if syntax_at_root else 'non-root'}"
|
|
]
|
|
deep_count_keys = [
|
|
f"{deep_frame_key}::{'root' if syntax_at_root else 'non-root'}"
|
|
]
|
|
|
|
elif group_by_role_expr:
|
|
if group_by_role_expr == 1:
|
|
role_exprs = [r for r, _ in struct.roles]
|
|
elif group_by_role_expr == 2:
|
|
role_exprs = get_role_expressions(
|
|
struct, roles, None, 1, False
|
|
)
|
|
elif group_by_role_expr == 3:
|
|
role_exprs = get_role_expressions(
|
|
struct, roles, session["frames_to_roles"], 1, False
|
|
)
|
|
elif group_by_role_expr == 4:
|
|
role_exprs = get_role_expressions(
|
|
struct, roles, None, None, True
|
|
)
|
|
else:
|
|
raise ValueError(
|
|
"Unknown value for param group_by_role_expr"
|
|
)
|
|
count_keys = []
|
|
deep_count_keys = []
|
|
for role_expr in role_exprs:
|
|
|
|
if group_by_role_expr == 4:
|
|
role_name, depth = role_expr.split("::")
|
|
depth = abs(int(depth))
|
|
if depth > 3:
|
|
depth = ">3"
|
|
role_expr = f"{role_name}::{depth}"
|
|
|
|
count_keys.append(f"{frame_key}::{role_expr}")
|
|
deep_count_keys.append(f"{deep_frame_key}::{role_expr}")
|
|
else:
|
|
count_keys = [struct.frame]
|
|
deep_count_keys = [struct.deep_frame]
|
|
|
|
for ck, dck in zip(count_keys, deep_count_keys):
|
|
if struct.frame in frame_filter:
|
|
if plot_over_days_post or plot_by_year:
|
|
relevant_frame_counts[time_bucket][ck] += 1
|
|
deep_frame_counts[time_bucket][dck] += 1
|
|
else:
|
|
relevant_frame_counts[ck] += 1
|
|
deep_frame_counts[dck] += 1
|
|
if plot_over_days_post or plot_by_year:
|
|
all_frame_counts[time_bucket][ck] += 1
|
|
else:
|
|
all_frame_counts[ck] += 1
|
|
print("Computing frame counts...")
|
|
|
|
if plot_over_days_post or plot_by_year:
|
|
data_and_names = [
|
|
(relevant_frame_counts, "relevant_frame_counts"),
|
|
(deep_frame_counts, "deep_frame_counts"),
|
|
(all_frame_counts, "all_frame_counts"),
|
|
]
|
|
data_out = {}
|
|
for (data, name) in data_and_names:
|
|
traces = defaultdict(lambda: {"x": [], "y": []})
|
|
for time_bucket in sorted(data):
|
|
total_count = sum(data[time_bucket].values())
|
|
for count_keys, count in data[time_bucket].items():
|
|
traces[count_keys]["x"].append(time_bucket)
|
|
traces[count_keys]["y"].append(
|
|
count / total_count if relative else count
|
|
)
|
|
data_out[name] = traces
|
|
return jsonify(data_out)
|
|
|
|
else:
|
|
relevant_frames_sr = pd.Series(data=relevant_frame_counts).sort_values(
|
|
ascending=False
|
|
)
|
|
deep_frames_sr = pd.Series(data=deep_frame_counts).sort_values(ascending=False)
|
|
all_frames_sr = pd.Series(data=all_frame_counts).sort_values(ascending=False)
|
|
|
|
if relative and group_by_role_expr > 0:
|
|
print("totals_by_frame=", totals_by_frame)
|
|
print("frame_filter=", frame_filter)
|
|
denom = totals_by_frame[frame_filter[0]]
|
|
print("denom=", denom)
|
|
relevant_frames_sr /= denom
|
|
deep_frames_sr /= deep_frames_sr.sum()
|
|
all_frames_sr /= all_frames_sr.sum()
|
|
|
|
elif relative:
|
|
relevant_frames_sr /= relevant_frames_sr.sum()
|
|
deep_frames_sr /= deep_frames_sr.sum()
|
|
all_frames_sr /= all_frames_sr.sum()
|
|
|
|
return jsonify(
|
|
{
|
|
"relevant_frame_counts": {
|
|
"x": relevant_frames_sr.index.tolist(),
|
|
"y": relevant_frames_sr.values.tolist(),
|
|
},
|
|
"deep_frame_counts": {
|
|
"x": deep_frames_sr.index.tolist(),
|
|
"y": deep_frames_sr.values.tolist(),
|
|
},
|
|
"all_frame_counts": {
|
|
"x": all_frames_sr.index.tolist(),
|
|
"y": all_frames_sr.values.tolist(),
|
|
},
|
|
}
|
|
)
|
|
|
|
|
|
|
|
|
|
@app.route("/similar_frames")
|
|
def similar_frames():
|
|
|
|
if gensim_m is None:
|
|
return jsonify({"result": "FAIL", "reason": "no GENSIM model has been loaded, please call /load_gensim and try again"})
|
|
|
|
words_in = [w for w in request.args.get("words_in").split("+") if "glove_" + w in gensim_m]
|
|
if not words_in:
|
|
return jsonify({"result": "FAIL", "reason": "No input words given"})
|
|
|
|
try:
|
|
matches = [res for res in gensim_m.most_similar(positive=["glove_" + w for w in words_in], topn=100) if res[0].startswith("fn_")]
|
|
except KeyError:
|
|
return jsonify({"result": "FAIL", "reason": "One of the input words does not exist in the GloVe vocabulary"})
|
|
|
|
frames = [m[0].lstrip("fn_") for m in matches]
|
|
probas = [m[1] for m in matches]
|
|
|
|
return jsonify({
|
|
"result": "OK",
|
|
"frames": frames,
|
|
"probabilities": probas,
|
|
})
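
# Example request (illustrative): GET /similar_frames?words_in=kill+weapon
# The "glove_"-prefixed word vectors are used as the query and the nearest
# "fn_"-prefixed frame vectors are returned together with their similarity scores.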
|
|
|
|
|
|
@app.route("/sociofillmore")
|
|
def sociofillmore():
|
|
|
|
|
|
input_text = request.args.get("text", "")
|
|
language = request.args.get("language", "en")
|
|
sentences = [s.text for s in spacy_model_ud(input_text).sents]
|
|
|
|
|
|
r = requests.get("https://responsibility-framing-sociolome.hf.space/analyze", {"text": "\n".join(sentences)})
|
|
|
|
lome_analyses = json.loads(r.text)["analyses"]
|
|
|
|
|
|
tmp_in = tempfile.NamedTemporaryFile(mode="w+", delete=False)
|
|
tmp_in.write(json.dumps(lome_analyses))
|
|
tmp_in.close()
|
|
|
|
tmp_out = tempfile.NamedTemporaryFile(mode="w+", delete=False)
|
|
tmp_out.close()
|
|
|
|
|
|
analyze_external_file(tmp_in.name, tmp_out.name, spacy_model_langs[language])
|
|
|
|
with open(tmp_out.name, "r") as f_out:
|
|
data_out = json.load(f_out)
|
|
|
|
os.unlink(tmp_in.name)
|
|
os.unlink(tmp_out.name)
|
|
|
|
return jsonify(data_out)
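
# Example request (illustrative):
#   GET /sociofillmore?text=The driver hit a cyclist.&language=en
# The text is sentence-split with the multilingual spaCy model, sent to the remote
# LOME demo for frame-semantic parsing, and the returned analyses are post-processed
# with analyze_external_file using the spaCy model for the requested language.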
|
|
|
|
|
|
if __name__ == "__main__":
|
|
from waitress import serve
|
|
|
|
if len(sys.argv) > 1:
|
|
host = sys.argv[1]
|
|
else:
|
|
host = "127.0.0.1"
|
|
|
|
debug = False
|
|
if len(sys.argv) > 2:
|
|
if sys.argv[2] == "debug":
|
|
debug = True
|
|
|
|
serve(app, host="0.0.0.0", port="5000")
|
|
|
|
|