File size: 11,425 Bytes
579b090 7988fb1 579b090 7988fb1 579b090 7988fb1 579b090 7988fb1 579b090 7988fb1 579b090 7988fb1 579b090 7988fb1 579b090 7988fb1 579b090 7988fb1 579b090 7988fb1 579b090 7988fb1 579b090 7988fb1 579b090 7988fb1 579b090 7988fb1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 |
from haystack.nodes.base import BaseComponent
from haystack.schema import Document
from haystack.nodes import ImageToTextConverter, PDFToTextConverter
from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
from pdf2image import convert_from_path
from typing import Callable, Dict, List, Optional, Text, Tuple, Union
from typing_extensions import Literal
import pandas as pd
import logging
import re
import string
from haystack.pipelines import Pipeline
import streamlit as st
def useOCR(file_path: str)-> Text:
    """
    Convert an image-only (scanned) PDF into text using the Haystack OCR
    converter.

    Params
    ----------
    file_path: path of the uploaded PDF file.

    Returns
    ----------
    The recognised text as one string; the text of consecutive pages is
    joined with the form-feed page separator ('\\x0c').
    """
    # Function-scope imports keep this block self-contained.
    import os
    import tempfile

    # The PDF has to be rendered to images first: one image per page.
    images = convert_from_path(pdf_path = file_path)

    # NOTE(review): the arguments after `remove_numeric_tables=True,` were not
    # recoverable from the reviewed listing; the converter's defaults are used
    # for everything else - confirm against the upstream file.
    converter = ImageToTextConverter(remove_numeric_tables=True)

    # placeholder to collect the text from each page
    placeholder = []

    # Page images are written to a temporary directory (removed on exit)
    # rather than to a hard-coded 'PDF\image_converted_N.png' path, which
    # contains an invalid '\i' escape and a Windows-only separator.
    with tempfile.TemporaryDirectory() as tmp_dir:
        for i, page_image in enumerate(images):
            image_path = os.path.join(tmp_dir, f'image_converted_{i+1}.png')
            page_image.save(image_path, 'PNG')
            # NOTE(review): `.content` is read from the result, so the first
            # element of the list returned by convert() is assumed - confirm.
            document = converter.convert(
                file_path=image_path, meta=None,
            )[0]
            placeholder.append(document.content)

    # join the text from each page by page separator
    return '\x0c'.join(placeholder)
class FileConverter(BaseComponent):
    """
    Wrapper class to convert an uploaded document into text by calling the
    appropriate Haystack converter class; falls back to OCR (see useOCR) for
    image-only PDFs. The Haystack FileClassifier cannot be used here because
    it does not have any label/output class for images.
    """

    outgoing_edges = 1

    def run(self, file_name: str , file_path: str, encoding: Optional[str]=None,
            id_hash_keys: Optional[List[str]] = None,
            ) -> Tuple[dict,str]:
        """
        Required method, invoked when the component runs inside a pipeline.

        Params
        ----------
        file_name: name of the file; its extension selects the converter
            ('.pdf', '.txt' or '.docx', matched case-insensitively).
        file_path: path of the uploaded file.
        encoding: forwarded unchanged to the converter's convert().
        id_hash_keys: forwarded to convert() and to the returned Document.

        Returns
        ----------
        output: dictionary whose 'documents' key holds a list with one
            Haystack Document containing the extracted text.
        output_1: as there is only one outgoing edge, the string 'output_1'.

        Raises
        ----------
        ValueError: if file_name has none of the supported extensions.
        """
        lowered = file_name.lower()
        if not lowered.endswith(('.pdf', '.txt', '.docx')):
            # Without this guard `converter` would be unbound further down.
            raise ValueError(f"Unsupported file type: {file_name!r}")

        # NOTE(review): the body of the original `except` handler was not
        # recoverable from the reviewed listing; here the error is logged and
        # re-raised so that a failure is never silently swallowed.
        try:
            if lowered.endswith('.pdf'):
                converter = PDFToTextConverter(remove_numeric_tables=True)
            elif lowered.endswith('.txt'):
                converter = TextConverter(remove_numeric_tables=True)
            else:
                converter = DocxToTextConverter()
        except Exception:
            logging.exception("could not create a converter for %s", file_name)
            raise

        # NOTE(review): `.content` is read from the result, so the first
        # element of the list returned by convert() is assumed - confirm.
        document = converter.convert(
            file_path=file_path, meta=None,
            encoding=encoding, id_hash_keys=id_hash_keys
        )[0]
        text = document.content

        # in case of scanned/images only PDF the content might contain only
        # the page separator (\f or \x0c). We check if it is so and use
        # the OCR to get the text.
        filtered = re.sub(r'\x0c', '', text)
        if filtered == "":
            logging.info("Using OCR")
            text = useOCR(file_path)

        documents = [Document(content=text,
                              meta={"name": file_name},
                              id_hash_keys=id_hash_keys)]

        logging.info('file conversion succesful')
        output = {'documents': documents}
        return output, 'output_1'

    def run_batch(self, *args, **kwargs):
        """
        There is no requirement to process multiple files in one go, so this
        does nothing; the method only exists because a custom node needs to
        define it.
        """
        return None
def basic(s:str, remove_punc:bool = False):
    """
    Performs basic cleaning of text.

    Params
    ----------
    s: string to be processed
    remove_punc: whether to strip every character in string.punctuation
        (including ',' and '.').

    Returns
    ----------
    The processed string, with leading/trailing whitespace stripped.
    """
    # Remove URLs: first whole lines that start with a URL (together with the
    # line break that follows), then any remaining 'http...' token.
    s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
    s = re.sub(r"http\S+", " ", s)

    # Remove new line characters
    s = s.replace('\n', ' ')

    # Remove punctuations
    if remove_punc:
        s = s.translate(str.maketrans('', '', string.punctuation))

    # Remove distracting single quotes and dotted pattern
    s = s.replace("'", " ")
    s = s.replace("..", "")

    return s.strip()
def paraLengthCheck(paraList, max_len = 100):
    """
    There are cases where the preprocessor cannot respect the word limit when
    using the respect-sentence-boundary flag, due to missing sentence
    boundaries. Therefore one more round of splitting is run here for those
    paragraphs.

    Params
    ----------
    paraList : list of paragraphs; each item must expose `.content` (str) and
        `.meta['page']`.
    max_len : maximum number of whitespace-separated words allowed per
        paragraph.

    Returns
    ----------
    List of (text, page) tuples. Paragraphs within the limit are kept with
    their content unchanged; longer ones are cut into consecutive chunks of at
    most `max_len` words, each carrying the page of its source paragraph.

    Raises
    ----------
    ValueError: if max_len is smaller than 1.
    """
    if max_len < 1:
        raise ValueError("max_len must be a positive integer")

    new_para_list = []
    for passage in paraList:
        # split once and reuse, instead of re-splitting for every slice
        words = passage.content.split()
        page = passage.meta['page']

        if len(words) > max_len:
            # e.g. a paragraph of 512 words with max_len=100 yields 6 chunks
            # (5 full ones plus the remainder). Stepping through the word
            # list means no empty trailing chunk is produced when the length
            # is an exact multiple of max_len.
            for start in range(0, len(words), max_len):
                chunk = " ".join(words[start:start + max_len])
                new_para_list.append((chunk, page))
        else:
            # paragraphs which dont need any splitting
            new_para_list.append((passage.content, page))

    logging.info("New paragraphs length {}".format(len(new_para_list)))
    return new_para_list
class UdfPreProcessor(BaseComponent):
    """
    Class to preprocess the documents returned by FileConverter. It checks
    the splitting strategy, splits each document by word or by sentence and
    then synthetically creates the paragraphs.
    """
    outgoing_edges = 1

    def run(self, documents:List[Document], remove_punc:bool=False, apply_clean = True,
            split_by: Literal["sentence", "word"] = 'sentence',
            split_length:int = 2, split_respect_sentence_boundary:bool = False,
            split_overlap:int = 0):
        """
        Required method, invoked when the component runs inside a pipeline.

        Params
        ----------
        documents: documents from the output dictionary returned by
            FileConverter.
        remove_punc: forwarded to basic(); whether to remove all punctuation
            including ',' and '.'.
        apply_clean: whether to run basic() on every generated paragraph.
        split_by: document splitting strategy, either 'word' or 'sentence'.
        split_length: length of each synthetically created paragraph, in
            units of `split_by`.
        split_respect_sentence_boundary: only honoured with the 'word'
            strategy; forced to False when splitting by sentence.
        split_overlap: number of words or sentences shared by neighbouring
            paragraphs, since a sentence or 'some words' often only make
            sense when read together with their neighbours.

        Returns
        ----------
        output: dictionary with four entries - 'documents' (list of Haystack
            Documents), 'dataframe' (pandas DataFrame built from them),
            'text' (all paragraph texts joined by a space) and 'paraList'
            (list of paragraph texts).
        output_1: as there is only one outgoing edge, the string 'output_1'.
        """
        if split_by == 'sentence':
            split_respect_sentence_boundary = False

        # NOTE(review): only the `split_respect_sentence_boundary` argument
        # and the page-number comment survived in the reviewed listing. The
        # split_* arguments are forwarded as described by the parameters
        # above and `add_page_number=True` is inferred from that comment; any
        # further (e.g. clean_*) arguments of the original call are unknown
        # and left at the library defaults - confirm against upstream.
        preprocessor = PreProcessor(
            split_by=split_by,
            split_length=split_length,
            split_respect_sentence_boundary= split_respect_sentence_boundary,
            split_overlap=split_overlap,
            # will add page number only in case of PDF not for text/docx file.
            add_page_number=True,
        )

        # Accumulate the paragraphs of every input document; assigning inside
        # the loop would keep only the last document's paragraphs and leave
        # the name undefined for an empty input.
        docs_processed = []
        for doc in documents:
            docs_processed.extend(preprocessor.process([doc]))

        if apply_clean:
            for item in docs_processed:
                item.content = basic(item.content, remove_punc= remove_punc)

        df = pd.DataFrame(docs_processed)
        # Taken from the documents rather than from `df.content`, so an empty
        # result (a frame without columns) does not raise.
        para_list = [item.content for item in docs_processed]
        all_text = " ".join(para_list)
        logging.info('document split into {} paragraphs'.format(len(para_list)))
        output = {'documents': docs_processed,
                  'dataframe': df,
                  'text': all_text,
                  'paraList': para_list
                  }
        return output, "output_1"

    def run_batch(self, *args, **kwargs):
        """
        There is no requirement to process multiple files in one go, so this
        does nothing; the method only exists because a custom node needs to
        define it.
        """
        return None
def processingpipeline():
    """
    Returns the preprocessing pipeline, built from the FileConverter and
    UdfPreProcessor nodes: 'FileConverter' takes the "File" input and feeds
    'UdfPreProcessor'.
    """
    preprocessing_pipeline = Pipeline()
    file_converter = FileConverter()
    custom_preprocessor = UdfPreProcessor()

    # NOTE(review): the opening of this first add_node call was missing from
    # the reviewed listing; it is restored to mirror the second call.
    preprocessing_pipeline.add_node(component = file_converter,
                                    name="FileConverter", inputs=["File"])
    preprocessing_pipeline.add_node(component = custom_preprocessor,
                                    name ='UdfPreProcessor', inputs=["FileConverter"])

    return preprocessing_pipeline