# Source: wbrule / haystack-api / pipelines_biobert.haystack-pipeline.yml
# Last commit: "Upload 12 files" (3fe47db), committed by intoxication
# Haystack pipeline definition for extractive question answering: a dense
# (embedding) retriever and a BM25 retriever are fused and fed to a BioBERT
# reader. The pipelines section of this file wires these components together.

# "ignore" tells Haystack not to compare this file's version with the
# installed Haystack release.
version: ignore

# Building blocks; pipelines and other components refer to them by `name`.
components:
  - name: DocumentStore
    type: ElasticsearchDocumentStore
    params:
      host: localhost

  # Dense retriever: selects the most relevant documents from the document
  # store. In the query pipeline its results go to the Joiner; in the
  # indexing pipeline it sits in front of the DocumentStore.
  - name: Retriever
    # Uses a Transformer model to encode the documents and the query
    type: EmbeddingRetriever
    params:
      document_store: DocumentStore
      # Alternative model: multi-qa-MiniLM-L6-dot-v1
      embedding_model: sentence-transformers/multi-qa-mpnet-base-dot-v1
      embed_meta_fields:
        - filename
      top_k: 10  # The number of results to return

  # Sparse (keyword) retriever queried alongside the dense Retriever.
  - name: BM25
    type: BM25Retriever
    params:
      document_store: DocumentStore
      top_k: 10

  # Merges the result lists of both retrievers into a single ranking.
  - name: Joiner
    type: JoinDocuments
    params:
      join_mode: reciprocal_rank_fusion

  # Extracts answers from the documents handed over by the Joiner
  # (10 per retriever, so at most 20 documents).
  - name: Reader
    # Transformer-based reader, specializes in extractive QA
    type: FARMReader
    params:
      # Alternative model: dmis-lab/biobert-base-cased-v1.1-squad
      model_name_or_path: dmis-lab/biobert-large-cased-v1.1-squad
      context_window_size: 700  # The size of the window around the answer span

  # Routes files based on their extension to appropriate converters,
  # by default txt, pdf, md, docx, html
  - name: FileTypeClassifier
    type: FileTypeClassifier

  # Converts files into documents
  - name: TextConverter
    type: TextConverter

  # Converts PDFs into documents
  - name: PDFConverter
    type: PDFToTextConverter

  # Splits documents into smaller ones and cleans them up
  - name: Preprocessor
    type: PreProcessor
    params:
      # With a vector-based retriever, it's good to split your documents
      # into smaller ones
      split_by: word  # The unit by which you want to split the documents
      split_length: 250  # The max number of words in a document
      split_overlap: 20  # Enables the sliding window approach
      # Retains complete sentences in split documents
      split_respect_sentence_boundary: true
      # Used by NLTK to best detect the sentence boundaries for that language
      language: en
# Here you define how the nodes are organized in the pipelines.
# For each node, specify its inputs; every node name must match a component
# declared under `components`.
pipelines:
  # Query time: both retrievers receive the query, their results are fused
  # by the Joiner, and the Reader extracts answers from the fused list.
  - name: query
    nodes:
      - name: Retriever
        inputs: [Query]
      - name: BM25
        inputs: [Query]
      - name: Joiner
        inputs: [Retriever, BM25]
      - name: Reader
        inputs: [Joiner]

  # Indexing time: convert the uploaded file, split it, pass it through the
  # embedding Retriever, and write the result to the DocumentStore.
  - name: indexing
    nodes:
      # Depending on the file type, we use a Text or PDF converter
      - name: FileTypeClassifier
        inputs: [File]
      - name: TextConverter
        # Ensures this converter receives TXT files
        inputs: [FileTypeClassifier.output_1]
      - name: PDFConverter
        # Ensures this converter receives PDFs
        inputs: [FileTypeClassifier.output_2]
      - name: Preprocessor
        inputs: [TextConverter, PDFConverter]
      - name: Retriever
        inputs: [Preprocessor]
      - name: DocumentStore
        inputs: [Retriever]