Spaces:
Sleeping
Sleeping
File size: 2,018 Bytes
9833a80 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on
@author:
@title: clean_dataset
@description: helper functions for splitting long text into fixed-width chunks and for basic string cleaning.
"""
#%%
import pandas as pd
import numpy as np
import string
from itertools import chain
from textwrap3 import wrap
import re
def split_at_length(dataframe, column, length, title=True):
    """
    Wrap the text in ``column`` into chunks of at most ``length`` characters.

    :param dataframe: pandas DataFrame holding the text to split; it must also
        contain a ``'title'`` column when ``title`` is truthy
    :param column: name of the column whose values are wrapped (each value is
        converted with ``str`` first)
    :param length: maximum chunk width, passed on to ``wrap``
    :param title: when truthy, also build a long-format table with one row per
        chunk; otherwise an empty list is returned in its place
    :return: tuple ``(dataframe, splitted)``. ``dataframe`` is a copy of the
        input with an added ``'wrapped'`` column (the chunks of each row joined
        by ``'; '``). ``splitted`` is a DataFrame with the columns ``'text'``
        (one chunk per row) and ``'title'`` (the title of the originating row),
        or ``[]`` when ``title`` is falsy.
    """
    chunk_lists = [wrap(str(value), length) for value in dataframe[column]]

    # assign() returns a new frame, so the caller's dataframe is not modified.
    dataframe = dataframe.assign(
        wrapped=["; ".join(chunks) for chunks in chunk_lists]
    )

    if not title:
        return dataframe, []

    # Build the long table straight from the chunk lists rather than by
    # re-splitting the joined 'wrapped' string: a chunk that itself contains
    # "; " would otherwise be cut into spurious extra rows.
    texts = []
    titles = []
    for row_title, chunks in zip(dataframe["title"], chunk_lists):
        # wrap() yields no chunks for empty or whitespace-only text; keep a
        # single empty-text row so every input row stays represented.
        for chunk in chunks or [""]:
            texts.append(chunk)
            titles.append(row_title)

    # Constructing the frame directly also works for an empty input, where
    # concatenating an empty list of Series would raise ValueError.
    splitted = pd.DataFrame({"text": texts, "title": titles})
    return dataframe, splitted
def basic(s):
    """
    Normalise a string with a fixed sequence of cleaning steps.

    The text is lowercased, ASCII punctuation is deleted, a series of regex
    substitutions each replace their matches with a single space, and the
    result is stripped of leading/trailing whitespace.

    :param s: string to be processed
    :return: the cleaned string
    """
    # Deleting punctuation (third maketrans argument) joins the surrounding
    # characters, e.g. "well-known" becomes "wellknown".
    punctuation_stripper = str.maketrans(' ', ' ', string.punctuation)
    cleaned = s.lower().translate(punctuation_stripper)

    # (pattern, flags) pairs, applied in order; every match becomes ' '.
    # NOTE: punctuation is already gone at this point, so the scheme-anchored
    # URL pattern and the single-quote pattern have nothing left to match;
    # URL remnants are caught by the "http\S+" pattern instead.
    substitutions = (
        (r'^https?:\/\/.*[\r\n]*', re.MULTILINE),  # URLs at start of a line
        (r"http\S+", 0),                           # any token from "http" on
        ('\n', 0),                                 # new line characters
        ("\'", 0),                                 # distracting single quotes
        (r'\d+', 0),                               # runs of digits
        (r'\W+', 0),                               # runs of non-word characters
    )
    for pattern, flags in substitutions:
        cleaned = re.sub(pattern, ' ', cleaned, flags=flags)

    # Custom word replacements can be added here, for example:
    # cleaned = re.sub(r'strengthenedstakeholder', 'strengthened stakeholder', cleaned)
    return cleaned.strip()
def remove_linebreaks(s):
    """
    Replace line breaks in a string with spaces and trim the result.

    Carriage-return/line-feed pairs, lone carriage returns and lone line feeds
    are each replaced by exactly one space, so text with Windows-style line
    endings does not keep stray carriage returns.

    :param s: string to be processed
    :return: the string with line breaks replaced by spaces and
        leading/trailing whitespace stripped
    """
    # Replace the two-character CRLF sequence first so it yields one space,
    # not two.
    for line_break in ("\r\n", "\r", "\n"):
        s = s.replace(line_break, " ")
    return s.strip()