Upload 10 files
- app.py +260 -0
- articulos_indexados.csv +0 -0
- articulos_ultima_semana.csv +0 -0
- carga_articulos.py +20 -0
- config.toml.txt +6 -0
- entrenamiento_modelo.py +50 -0
- preprocesamiento_articulos.py +106 -0
- repartidor_periodicos.jpeg +0 -0
- requirements.txt +14 -0
- resultados_consulta.py +86 -0
app.py
ADDED
@@ -0,0 +1,260 @@
import numpy as np
import pandas as pd
import streamlit as st
import streamlit.components.v1 as components
from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
from carga_articulos import cargar_articulos
from preprocesamiento_articulos import limpieza_articulos, remove_URL, remove_html_markup, remove_emoji, remover_casos_especiales, frases_remover, obtener_kpes
from entrenamiento_modelo import term_document_matrix, tf_idf_score
from resultados_consulta import resultados_consulta, detalles_resultados
import tensorflow as tf
import tensorflow.python.ops.numpy_ops.np_config as np_config
from math import ceil
from datetime import datetime

###

def split_frame(input_df, rows):
    # Split the dataframe into pages of `rows` rows each
    df = []
    for i in range(0, len(input_df), rows):
        df.append(input_df.iloc[i : i + rows, :])
    return df

def paginar_frame(df):
    # Render one card per article row
    N_cards_per_row = 1
    for n_row, row in df.reset_index().iterrows():
        i = n_row % N_cards_per_row
        if i == 0:
            st.write("---")
            cols = st.columns(N_cards_per_row, gap="large")
        # draw the card
        with cols[n_row % N_cards_per_row]:
            if 'answer' in row:
                if (row['answer']):
                    t = row['answer'] + ' (score: ' + str(row['score']) + ')'
                    st.info(t)
            row['resumen'] = remove_html_markup(row['resumen'])
            row['resumen'] = remove_URL(row['resumen'])
            if (len(row['resumen']) > 600):
                row['resumen'] = row['resumen'][0:600]
            st.caption(f"{row['feed'].strip()} - {row['seccion'].strip()} - {row['fecha'].strip()} ")
            st.markdown(f"**{row['titulo'].strip()}**")
            st.markdown(f"{row['resumen'].strip()}")
            st.markdown(f"{row['link']}")

def load_qa_model():

    tokenizer = AutoTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased', use_fast=False)
    model = TFAutoModelForQuestionAnswering.from_pretrained("Lisibonny/modelo_qa_beto_squad_es_pdqa")
    return tokenizer, model

# Use streamlit to create a web app
def main():

    st.set_page_config(page_title="Buscador de noticias periodicos dominicanos", page_icon="📰", layout="centered")
    st.image('repartidor_periodicos.jpeg', width=150)
    st.header('El Repartidor Dominicano :red[experimental]')

    df, fecha_min, fecha_max = cargar_articulos()
    fecha_min = fecha_min[:19]
    fecha_max = fecha_max[:19]

    fecha_min = datetime.strptime(fecha_min, '%Y-%m-%d %H:%M:%S')
    fecha_max = datetime.strptime(fecha_max, '%Y-%m-%d %H:%M:%S')

    days = (fecha_max - fecha_min).days
    fecha_min = fecha_min.strftime("%d-%m-%Y %I:%M %p")
    fecha_max = fecha_max.strftime("%d-%m-%Y %I:%M %p")

    usar_barra_progreso = 1

    # Sidebar
    st.sidebar.header("Acerca De")
    st.sidebar.markdown(
        "El Repartidor Dominicano es un sistema de recuperación de información desde periódicos dominicanos que usa técnicas de aprendizaje automático."
    )
    st.sidebar.markdown("Desarrollado por [Lisibonny Beato-Castro](https://scholar.google.com/citations?user=KSzjfeUAAAAJ&hl=es&oi=ao)")

    st.sidebar.header("Artículos Indexados")
    st.sidebar.markdown(
        """
        Fuentes:

        - [Diario Libre](https://www.diariolibre.com/)
        - [El Nacional](https://www.elnacional.com.do/)
        - [Remolacha.net](https://www.remolacha.net/)
        - [AlMomento.net](https://almomento.net/)
        - [Gente Tuya](http://www.gentetuya.com)

        """
    )
    st.sidebar.markdown(f"Noticias de los últimos: **{days} días**")
    st.sidebar.markdown(f"Fecha más antigua: **{fecha_min}**")
    st.sidebar.markdown(f"Fecha más reciente: **{fecha_max}**")
    st.sidebar.header("Aviso Legal Sobre Uso de Datos")
    st.sidebar.markdown(
        """
        El uso de los artículos en este sitio tiene fines no comerciales, respetando los derechos de autor. Implementamos las mejores prácticas para el uso de RSS, tal y como son recomendadas por el Berkman Klein Center for Internet & Society de la Universidad de Harvard.

        Si quieres saber más acerca de los feeds RSS o de las mejores prácticas para el uso de RSS, haz clic en los siguientes enlaces:

        - [RSS](https://es.wikipedia.org/wiki/RSS)
        - [Uso legal de feeds RSS](https://cyber.harvard.edu/publications/2010/news_aggregator_legal_implications_best_practices)
        """
    )

    st.sidebar.header("¡Cómprame un Café!")
    st.sidebar.markdown("Si te gusta este sitio y quieres darme las gracias o animarme a hacer más, puedes hacer una pequeña donación.")
    with st.sidebar:
        st.markdown("[](https://www.paypal.com/donate/?hosted_button_id=VK5ZAB52ZYDNA)")

    articulos_indexados = pd.read_csv('articulos_indexados.csv')
    articulos_indexados = articulos_indexados.set_index('Unnamed: 0')
    tokenizer, qa_model = load_qa_model()
    kpes = obtener_kpes(df)

    query = st.text_input(
        "Escribe tus términos de búsqueda o haz una pregunta usando los caracteres ¿?:"
    )

    # Popular topics
    st.write("Tópicos populares en los artículos indexados:")
    cadena = ':blue['
    for value in kpes:
        cadena = cadena + ' - ' + str(value[0])
    cadena = cadena + ']'
    st.write(cadena)

    if query:

        # If a question was asked
        if (('¿' == query[0]) and ('?' == query[len(query) - 1])):
            st.write("Contestando a: ", query)
            # Try every article summary as context for the question
            cantidad_respuestas = 0
            lista_noticias_respuestas = []
            all_results = pd.DataFrame(columns=["id", "answer", "score", "start", "end"])
            df_answer = df
            df_answer['answer'] = ''
            df_answer['score'] = 0

            progress_text = "Buscando respuestas. Por favor, espere."
            my_bar = st.progress(0, text=progress_text)
            total_respuestas = len(df_answer)
            for i in range(total_respuestas):

                text = remove_html_markup(df_answer.loc[i, "resumen"])
                text = remove_URL(text)
                text = remove_emoji(text)
                text = frases_remover(text)
                text = remover_casos_especiales(text)

                inputs = tokenizer(query, text[0:512], return_tensors='tf')
                input_ids = inputs["input_ids"].numpy()[0]

                text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
                outputs = qa_model(inputs)
                answer_start = tf.argmax(outputs.start_logits, axis=1).numpy()[0]
                answer_end = (tf.argmax(outputs.end_logits, axis=1) + 1).numpy()[0]

                answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

                all_results.loc[i] = i, answer, max(outputs.start_logits.numpy()[0]), 0, 0

                # Progress bar
                if (usar_barra_progreso == 1):
                    porcentaje_progreso = round((i / total_respuestas) * 100)
                    if (porcentaje_progreso in range(1, 101)):
                        my_bar.progress(porcentaje_progreso, text=progress_text)

            my_bar.empty()
            usar_barra_progreso = 0

            # Keep the answers with the highest scores (top 5)
            all_results = all_results.sort_values(by=['score'], ascending=False).head(5)
            # If any of them says there is no answer, none will be returned
            if not (all_results['answer'].isnull().any()):
                for index, row in all_results.iterrows():
                    if (len(row['answer']) > 0):
                        cantidad_respuestas = cantidad_respuestas + 1
                        i = row['id']
                        df_answer.loc[i, "answer"] = row.loc['answer']
                        df_answer.loc[i, "score"] = row.loc['score']
                        lista_noticias_respuestas.append(df_answer.loc[i].to_frame().T)

                df_noticias_respuestas = pd.concat(lista_noticias_respuestas)
                batch_size = 5
                pages = split_frame(df_noticias_respuestas, batch_size)
                top_menu = st.columns(3)

                pagination = st.container()

                bottom_menu = st.columns((3))

                with pagination:

                    with bottom_menu[2]:
                        total_pages = (ceil(cantidad_respuestas / batch_size) if ceil(cantidad_respuestas / batch_size) > 0 else 1)
                        current_page = st.number_input("Página", min_value=1, max_value=total_pages, step=1)

                    with bottom_menu[1]:
                        st.write("---")
                        st.markdown(f"Página **{current_page}** de **{total_pages}** ")

                    with top_menu[0]:
                        pagina_res_fin = batch_size * current_page if batch_size * current_page <= cantidad_respuestas else cantidad_respuestas
                        st.markdown(f"Respuestas **{(current_page*batch_size)-batch_size+1}-{pagina_res_fin}** de **{cantidad_respuestas}** ")

                    paginar_frame(pages[current_page - 1])

        # If keywords were given instead
        else:

            st.write("Buscando: ", query)
            result = resultados_consulta(df, articulos_indexados, query)

            if result.empty:
                st.info("No se encontraron artículos para la búsqueda solicitada")

            else:

                df_results = detalles_resultados(df, result)
                cantidad_resultados = len(df_results)
                batch_size = 5
                pages = split_frame(df_results, batch_size)
                top_menu = st.columns(3)

                pagination = st.container()

                bottom_menu = st.columns((3))

                with bottom_menu[2]:
                    total_pages = (ceil(cantidad_resultados / batch_size) if ceil(cantidad_resultados / batch_size) > 0 else 1)
                    current_page = st.number_input("Página", min_value=1, max_value=total_pages, step=1)

                with bottom_menu[1]:
                    st.write("---")
                    st.markdown(f"Página **{current_page}** de **{total_pages}** ")

                with top_menu[0]:
                    pagina_res_fin = batch_size * current_page if batch_size * current_page <= cantidad_resultados else cantidad_resultados
                    st.markdown(f"Artículos **{(current_page*batch_size)-batch_size+1}-{pagina_res_fin}** de **{cantidad_resultados}** ")

                with pagination:

                    paginar_frame(pages[current_page - 1])


if __name__ == "__main__":
    main()
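For orientation, here is a minimal standalone sketch of the span-extraction step that main() runs for every article summary. The model names are the ones loaded in load_qa_model(); the question and context strings are invented for illustration.

import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased')
qa_model = TFAutoModelForQuestionAnswering.from_pretrained("Lisibonny/modelo_qa_beto_squad_es_pdqa")

pregunta = "¿Quién anunció las medidas?"
contexto = "El ministro de Economía anunció las medidas durante una rueda de prensa."

inputs = tokenizer(pregunta, contexto, return_tensors='tf')
outputs = qa_model(inputs)

# The highest-scoring start/end logits delimit the answer span inside the tokenized input
start = tf.argmax(outputs.start_logits, axis=1).numpy()[0]
end = (tf.argmax(outputs.end_logits, axis=1) + 1).numpy()[0]
input_ids = inputs["input_ids"].numpy()[0]
print(tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[start:end])))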
articulos_indexados.csv
ADDED
The diff for this file is too large to render.
See raw diff
articulos_ultima_semana.csv
ADDED
The diff for this file is too large to render.
See raw diff
carga_articulos.py
ADDED
@@ -0,0 +1,20 @@
import pandas as pd
import streamlit as st
from dateutil.parser import parse
import locale

def cargar_articulos():
    articulos = pd.read_csv('articulos_ultima_semana.csv')
    #articulos = articulos.iloc[0:250, :]  # Load only 250 articles
    # Drop the first (unnamed) column
    articulos.drop("Unnamed: 0", axis=1, inplace=True)
    # Create a new ID column
    articulos['ID'] = articulos.index
    articulos.ID = pd.Series(["D" + str(ind) for ind in articulos.ID])
    fechas = articulos['fecha'].apply(lambda x: parse(x))
    fechas = fechas.astype(str)
    fecha_minima = fechas.min()
    fecha_maxima = fechas.max()
    # Return the dataframe with the articles plus some data about them
    return articulos, fecha_minima, fecha_maxima
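A small smoke test of cargar_articulos(), assuming a tiny hand-made CSV; the column names mirror the ones app.py reads (titulo, link, fecha, resumen, seccion, feed), and the sample rows and URLs are invented.

import pandas as pd
from carga_articulos import cargar_articulos

# Write a two-row sample file in the expected layout (the default index becomes "Unnamed: 0")
pd.DataFrame({
    "titulo": ["Titular uno", "Titular dos"],
    "link": ["https://example.com/1", "https://example.com/2"],
    "fecha": ["2024-01-01 08:00:00", "2024-01-02 09:30:00"],
    "resumen": ["Texto del primer artículo", "Texto del segundo artículo"],
    "seccion": ["nacional", "deportes"],
    "feed": ["Diario Libre", "El Nacional"],
}).to_csv("articulos_ultima_semana.csv")

articulos, fecha_min, fecha_max = cargar_articulos()
print(articulos["ID"].tolist())   # ['D0', 'D1']
print(fecha_min, fecha_max)       # earliest and latest article dates, as strings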
config.toml.txt
ADDED
@@ -0,0 +1,6 @@
[theme]
base = 'dark'
primaryColor = 'F63366'
font = 'sans serif'

[server]
enableCORS = false
entrenamiento_modelo.py
ADDED
@@ -0,0 +1,50 @@
import pandas as pd
import numpy as np


# df = The dataset, as a dataframe
# vocab = The vocabulary extracted from the documents, as a list of strings
# document_index = Name of the dataframe column holding the document ids
# text = Name of the dataframe column holding the document text

def term_document_matrix(df, vocab, document_index, text):

    vocab_index = pd.DataFrame(columns=df[document_index], index=vocab).fillna(0)

    for word in vocab_index.index:
        for doc in df[document_index]:
            freq = df[df[document_index] == doc][text].values[0].count(word)
            vocab_index.loc[word, doc] = freq
    return vocab_index  # Returns a dataframe with the term-document matrix of raw frequencies


# vocab_index = The term-document matrix computed by term_document_matrix
# document_index = Series containing the document ids
# inv_df = Name of the column holding the computed inverse document frequency


def tf_idf_score(vocab_index, document_index, inv_df='inverse_document_frequency'):

    total_docx = len(document_index)
    vocab_index['document_frequency'] = vocab_index.sum(axis=1)
    vocab_index['inverse_document_frequency'] = np.log2(total_docx / vocab_index['document_frequency'])

    for word in vocab_index.index:

        for doc in document_index:

            tf_idf = np.log2(1 + vocab_index.loc[word, doc]) * np.log2(vocab_index.loc[word][inv_df])
            vocab_index.loc[word, 'tf_idf_' + str(doc)] = tf_idf  # assign with .loc[row, col] so the tf_idf_<doc> column is created on the dataframe

    return vocab_index  # Returns a dataframe containing the term-document matrix,
                        # the document frequency, the inverse document frequency,
                        # and the tf_idf score


def generar_archivo_indexado():
    # Stub: generation of the indexed file is not implemented here

    return 0
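A minimal sketch of how the two functions above fit together, using toy data invented for illustration; the 'D0'/'D1' ids follow the ID convention built in carga_articulos.py.

import pandas as pd
from entrenamiento_modelo import term_document_matrix, tf_idf_score

docs = pd.DataFrame({
    "ID": ["D0", "D1"],
    "titulo": ["gobierno anuncia nuevo presupuesto", "equipo dominicano gana torneo"],
})
vocab = ["gobierno", "presupuesto", "torneo"]

matriz = term_document_matrix(docs, vocab, "ID", "titulo")
print(matriz.loc["torneo", "D1"])   # 1: raw frequency of the term in document D1

indexada = tf_idf_score(matriz, docs["ID"])
print(indexada.columns.tolist())    # ['D0', 'D1', 'document_frequency', 'inverse_document_frequency', 'tf_idf_D0', 'tf_idf_D1']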
preprocesamiento_articulos.py
ADDED
@@ -0,0 +1,106 @@
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import textacy

from sklearn.feature_extraction.text import CountVectorizer
import csv
import re

nltk.download('stopwords')
nltk.download('punkt')
stopwords_es = stopwords.words('spanish')
spanish_stemmer = SnowballStemmer('spanish')


def remove_html_markup(s):
    tag = False
    quote = False
    out = ""

    for c in s:
        if c == '<' and not quote:
            tag = True
        elif c == '>' and not quote:
            tag = False
        elif (c == '"' or c == "'") and tag:
            quote = not quote
        elif not tag:
            out = out + c

    return out

def remove_URL(s):
    """Remove URLs from a sample string"""
    return re.sub(r"http\S+", "", s)

def eliminar_puntuacion(articulo):
    deletion_symbols = ['!','(',')',"'",'-','[',']','{','}',';',':','"','“','’','”',"'",'`','‘','``','\\' ,'/','|',',','|','<','>','.','..','...','?','@',"#",'$','^','&','*','_','~','+','%','=','¿','¡',"''"]
    new_articulo = ""
    for x in articulo:
        if x not in deletion_symbols:
            new_articulo += x
    return new_articulo

def remove_emoji(s):
    regex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+", flags = re.UNICODE)
    return regex_pattern.sub(r'', s)

def remover_casos_especiales(s):
    # Remove leading text that ends with ".-", usually a dateline used at the start of some articles
    s = re.sub(r'^\w+(,)*([\s]\w+)*([\s]\(\w+\))*.-', '', s)
    return s

def frases_remover(s):
    lista_frases_remover = ['La entrada', 'la entrada', '(Seguir leyendo…)', 'se publicó primero en', 'Remolacha - Noticias Republica Dominicana', 'Read more ›', 'Read more', '[…]', 'RELACIONADAS']
    for l in lista_frases_remover:
        s = s.replace(l, '')
    return s

def eliminar_stopwords(articulo):

    articulo_splitted = articulo.split()
    new_articulo = ""
    for x in articulo_splitted:
        if x not in stopwords_es:
            new_articulo += " " + x
    return new_articulo

def obtener_raices(articulo):

    articulo_splitted = articulo.split()
    new_articulo = ""
    for x in articulo_splitted:
        x_new = spanish_stemmer.stem(x)
        new_articulo += " " + x_new
    return new_articulo

def limpieza_articulos(df):

    df_titulos = pd.DataFrame(df['titulo'], columns=['titulo'])
    # Lower-case the text
    df_titulos['titulo'] = df_titulos['titulo'].apply(lambda x: x.lower())
    # Remove punctuation
    df_titulos['titulo'] = df_titulos['titulo'].apply(lambda x: eliminar_puntuacion(x))
    # Remove stopwords using the Spanish corpus bundled with nltk
    df_titulos['titulo'] = df_titulos['titulo'].apply(lambda x: eliminar_stopwords(x))
    all_text = ' '.join(df_titulos['titulo'])
    vocab = np.unique(word_tokenize(all_text))
    return vocab

def obtener_kpes(df):
    df_titulos = pd.DataFrame(df['titulo'], columns=['titulo'])
    all_text = '. '.join(df_titulos['titulo'])
    titulos = textacy.make_spacy_doc(all_text, lang='es_core_news_sm')
    return textacy.extract.keyterms.textrank(titulos, normalize='lower', topn=10)
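A small sketch of the cleaning chain that app.py applies to each article summary before handing it to the QA model; the sample text and URL below are invented.

from preprocesamiento_articulos import (
    remove_html_markup, remove_URL, remove_emoji, frases_remover, remover_casos_especiales
)

texto = "SANTO DOMINGO.- El <b>gobierno</b> anunció 😀 nuevas medidas. Read more › https://example.com/nota"
texto = remove_html_markup(texto)
texto = remove_URL(texto)
texto = remove_emoji(texto)
texto = frases_remover(texto)
texto = remover_casos_especiales(texto)
print(texto)  # roughly " El gobierno anunció  nuevas medidas.  ": markup, URL, emoji and boilerplate phrases stripped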
repartidor_periodicos.jpeg
ADDED
requirements.txt
ADDED
@@ -0,0 +1,14 @@
fastapi[all]
openai
python-dotenv
pydantic==1.*
langchain
bs4
tiktoken
nltk
scikit-learn
transformers
tf-keras
accelerate
textacy
https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0.tar.gz
resultados_consulta.py
ADDED
@@ -0,0 +1,86 @@
import pandas as pd
import numpy as np
from preprocesamiento_articulos import remove_URL, remove_html_markup, eliminar_puntuacion, eliminar_stopwords, obtener_raices
import streamlit as st


# Lower-case the query and strip stopwords and some punctuation characters

def query_processing(query):
    query = eliminar_puntuacion(query)  # Remove punctuation
    query = query.strip().lower()       # Lower-case
    query = eliminar_stopwords(query)
    query = obtener_raices(query)
    return query


# vocab_index = Term-document matrix with all frequencies already computed; in our case it is
#               what the csv contains, loaded into a dataframe.
# query = The query being run

def query_score(vocab_index, query):
    for word in np.unique(query.split()):
        freq = query.count(word)
        if word in vocab_index.index:
            tf_idf = np.log2(1 + freq) * np.log2(vocab_index.loc[word].inverse_document_frequency)
            vocab_index.loc[word, "query_tf_idf"] = tf_idf
            vocab_index['query_tf_idf'].fillna(0, inplace=True)

    return vocab_index  # tf_idf matrix for the document terms and the query terms, as a dataframe


# vocab_index = DataFrame holding the tf-idf scores per term for each document and for the query
# document_index = List of document IDs
# query_scores = Name of the dataframe column holding the query tf_idf scores


def cosine_similarity(vocab_index, document_index, query_scores):

    cosine_scores = {}

    query_scalar = np.sqrt(sum(vocab_index[query_scores] ** 2))

    for doc in document_index:

        doc_scalar = np.sqrt(sum(vocab_index[str(doc)] ** 2))
        dot_prod = sum(vocab_index[str(doc)] * vocab_index[query_scores])
        cosine = (dot_prod / (query_scalar * doc_scalar))

        cosine_scores[doc] = cosine

    return pd.Series(cosine_scores)  # A pandas Series with the query similarity score for each document


# data: Dataframe holding the document ids and text
# cosine_scores: Series holding the cosine score of each document
# document_index: Name of the column holding the document ids in the data dataframe

def retrieve_index(data, cosine_scores, document_index, topn=10):

    data = data.set_index(document_index)
    data['scores'] = cosine_scores
    df_top_scores = data.reset_index().sort_values('scores', ascending=False).head(topn)
    cutoff = np.average(df_top_scores['scores']) + 0.75 * np.std(df_top_scores['scores'])  # Cutoff score suggested in the literature
    df_top_scores = df_top_scores[df_top_scores['scores'] > cutoff]

    return df_top_scores.index  # Index of the rows whose cosine score passes the cutoff


def resultados_consulta(df, articulos_indexados, query):
    indices = pd.Index([], dtype='int64')
    query = query_processing(query)
    qs = query_score(articulos_indexados, query)
    if 'query_tf_idf' in qs.columns:
        cosenos = cosine_similarity(qs, df['ID'].values, 'query_tf_idf')
        indices = retrieve_index(df, cosenos, 'ID', len(df))
    return indices

def detalles_resultados(df, indices):
    top = df.loc[indices]
    top['resumen'] = top['resumen'].apply(lambda x: remove_html_markup(x))
    top['resumen'] = top['resumen'].apply(lambda x: remove_URL(x))
    top['resumen'] = top['resumen'].apply(lambda x: x[0:600] + '[...]' if len(x) > 600 else x)
    top = top.loc[:, ['titulo', 'link', 'fecha', 'resumen', 'seccion', 'feed']]
    return top
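A rough sketch of the keyword path that app.py follows with these functions: load the precomputed index, run the query through retrieval, then render the details. It assumes the repo's CSV files are present; the query string is invented.

import pandas as pd
from carga_articulos import cargar_articulos
from resultados_consulta import resultados_consulta, detalles_resultados

df, _, _ = cargar_articulos()
articulos_indexados = pd.read_csv('articulos_indexados.csv').set_index('Unnamed: 0')

indices = resultados_consulta(df, articulos_indexados, "elecciones municipales")
if indices.empty:
    print("No se encontraron artículos para la búsqueda solicitada")
else:
    print(detalles_resultados(df, indices)[['titulo', 'fecha']])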