import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import textacy
from sklearn.feature_extraction.text import CountVectorizer
import csv
import re

nltk.download('stopwords')
nltk.download('punkt')

stopwords_es = stopwords.words('spanish')
spanish_stemmer = SnowballStemmer('spanish')
def remove_html_markup(s):
    """Strip HTML tags, ignoring '<' and '>' that appear inside quoted attribute values."""
    tag = False
    quote = False
    out = ""
    for c in s:
        if c == '<' and not quote:
            tag = True
        elif c == '>' and not quote:
            tag = False
        elif (c == '"' or c == "'") and tag:
            quote = not quote
        elif not tag:
            out = out + c
    return out
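# A quick sanity check (hypothetical input, not from the dataset); quoted
# attribute values can contain '>' without closing the tag:
# remove_html_markup('<a href="https://example.com">Portada</a> del diario')
# -> 'Portada del diario'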
def remove_URL(s):
    """Remove URLs from a sample string"""
    return re.sub(r"http\S+", "", s)
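# Example (hypothetical): remove_URL('ver https://t.co/abc aqui') -> 'ver  aqui'.
# The pattern only catches tokens that start with "http", so bare domains like
# "example.com" pass through untouched.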
def eliminar_puntuacion(articulo):
    # The loop below compares one character at a time, so only single-character
    # symbols can ever match; duplicate and multi-character entries (e.g. '...',
    # '``') were dead entries and have been dropped from the list.
    deletion_symbols = ['!', '(', ')', "'", '-', '[', ']', '{', '}', ';', ':', '"',
                        '“', '’', '”', '`', '‘', '\\', '/', '|', ',', '<', '>', '.',
                        '?', '@', '#', '$', '^', '&', '*', '_', '~', '+', '%', '=',
                        '¿', '¡']
    new_articulo = ""
    for x in articulo:
        if x not in deletion_symbols:
            new_articulo += x
    return new_articulo
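# Example (hypothetical): eliminar_puntuacion('¡hola, mundo!') -> 'hola mundo'
# (symbols are dropped character by character; whitespace is preserved).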
def remove_emoji(s):
    regex_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               "]+", flags=re.UNICODE)
    return regex_pattern.sub(r'', s)
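# Example (hypothetical): remove_emoji('Alerta 🚨 del COE') -> 'Alerta  del COE'.
# Only the four Unicode blocks listed above are covered, so emoji from newer
# blocks would remain in the text.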
def remover_casos_especiales(s):
    # Remove a leading dateline ending in ".-" (commonly used to open some
    # articles). The dot is now escaped so the pattern matches a literal "."
    # rather than any character.
    s = re.sub(r'^\w+(,)*([\s]\w+)*([\s]\(\w+\))*\.-', '', s)
    return s
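# Example (hypothetical dateline):
# remover_casos_especiales('Santo Domingo (EFE).- El gobierno anunció...')
# -> ' El gobierno anunció...'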
def frases_remover(s):
    # Boilerplate phrases from the source feeds (footer text, "read more" links,
    # site names) that should not count as article content.
    lista_frases_remover = ['La entrada', 'la entrada', '(Seguir leyendo…)',
                            'se publicó primero en',
                            'Remolacha - Noticias Republica Dominicana',
                            'Read more ›', 'Read more', '[…]', 'RELACIONADAS']
    for l in lista_frases_remover:
        s = s.replace(l, '')
    return s
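# Example (hypothetical feed footer):
# frases_remover('La entrada Titular se publicó primero en Remolacha - Noticias Republica Dominicana')
# -> ' Titular  '. Replacements are literal and case-sensitive, so only the
# exact variants listed (e.g. both 'La entrada' and 'la entrada') are removed.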
def eliminar_stopwords(articulo):
    articulo_splitted = articulo.split()
    new_articulo = ""
    for x in articulo_splitted:
        if x not in stopwords_es:
            new_articulo += " " + x
    return new_articulo
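# Example (hypothetical): eliminar_stopwords('el precio de la gasolina sube')
# -> ' precio gasolina sube' (note the leading space; downstream tokenization
# makes it harmless). NLTK's stopword list is lowercase, so lowercase the
# input first, as limpieza_articulos below does.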
def obtener_raices(articulo):
    articulo_splitted = articulo.split()
    new_articulo = ""
    for x in articulo_splitted:
        x_new = spanish_stemmer.stem(x)
        new_articulo += " " + x_new
    return new_articulo
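# A minimal check (hypothetical input; the exact stems depend on the Snowball
# Spanish rules, which trim plural and verb endings):
# print(obtener_raices('los corredores corrieron'))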
def limpieza_articulos(df):
    df_titulos = pd.DataFrame(df['titulo'], columns=['titulo'])
    # Lowercase the text
    df_titulos['titulo'] = df_titulos['titulo'].apply(lambda x: x.lower())
    # Remove punctuation marks
    df_titulos['titulo'] = df_titulos['titulo'].apply(lambda x: eliminar_puntuacion(x))
    # Remove stopwords using NLTK's Spanish stopword corpus
    df_titulos['titulo'] = df_titulos['titulo'].apply(lambda x: eliminar_stopwords(x))
    all_text = ' '.join(df_titulos['titulo'])
    vocab = np.unique(word_tokenize(all_text))
    return vocab
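# A minimal sketch of the intended call (hypothetical DataFrame; the real
# pipeline presumably supplies scraped articles with a 'titulo' column):
# df = pd.DataFrame({'titulo': ['Sube el precio de la gasolina', 'Baja el dólar']})
# vocab = limpieza_articulos(df)  # unique lowercase tokens, minus stopwords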
def obtener_kpes(df):
    df_titulos = pd.DataFrame(df['titulo'], columns=['titulo'])
    # Join the titles with '. ' so sentence boundaries survive for spaCy.
    all_text = '. '.join(df_titulos['titulo'])
    titulos = textacy.make_spacy_doc(all_text, lang='es_core_news_sm')
    return textacy.extract.keyterms.textrank(titulos, normalize='lower', topn=10)
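# textrank returns (keyterm, score) pairs, and the spaCy model must be installed
# beforehand (python -m spacy download es_core_news_sm). Hypothetical usage:
# for term, score in obtener_kpes(df):
#     print(f'{term}: {score:.3f}')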