XThomasBU committed
Commit bbd24f7 · 1 Parent(s): 36cd3f9

minor updates

code/modules/config/config.yml CHANGED
@@ -37,7 +37,7 @@ llm_params:
   temperature: 0.7 # float
   repo_id: 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF' # HuggingFace repo id
   filename: 'tinyllama-1.1b-chat-v1.0.Q5_0.gguf' # Specific name of gguf file in the repo
-  pdf_reader: 'llama' # str [llama, pymupdf, gpt]
+  pdf_reader: 'pymupdf' # str [llama, pymupdf, gpt]
   stream: False # bool
 
 chat_logging:
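
The only change in this file is the default pdf_reader switching from 'llama' to 'pymupdf'. As a quick sanity check, a minimal sketch of reading that setting back, based on the yaml.safe_load call and the config["llm_params"]["pdf_reader"] lookup that appear in the data_loader.py hunks below (the relative path is illustrative):

import yaml

with open("code/modules/config/config.yml", "r") as f:
    config = yaml.safe_load(f)

# 'llama', 'pymupdf', or 'gpt'; expected to be 'pymupdf' after this commit
print(config["llm_params"]["pdf_reader"])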
code/modules/dataloader/data_loader.py CHANGED
@@ -63,7 +63,7 @@ class HTMLReader:
                 href = href.replace("http", "https")
 
             absolute_url = urljoin(base_url, href)
-            link['href'] = absolute_url
+            link["href"] = absolute_url
 
             resp = requests.head(absolute_url)
             if resp.status_code != 200:
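
The functional line here, link["href"] = absolute_url, rewrites each anchor to the absolute URL produced by urljoin before requests.head probes it; the change itself is only a quote-style reformat. A standalone sketch of the standard-library behavior these lines rely on (example URLs are illustrative):

from urllib.parse import urljoin

base_url = "https://dl4ds.github.io/sp2024/lectures/"
print(urljoin(base_url, "05_loss_functions_v2.pdf"))   # relative href -> absolute URL
print(urljoin(base_url, "https://example.com/x.pdf"))  # absolute href passes through unchanged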
@@ -84,6 +84,7 @@ class HTMLReader:
         else:
             return None
 
+
 class FileReader:
     def __init__(self, logger, kind):
         self.logger = logger
@@ -95,8 +96,9 @@ class FileReader:
         else:
             self.pdf_reader = PDFReader()
         self.web_reader = HTMLReader()
-        self.logger.info(f"Initialized FileReader with {kind} PDF reader and HTML reader")
-
+        self.logger.info(
+            f"Initialized FileReader with {kind} PDF reader and HTML reader"
+        )
 
     def extract_text_from_pdf(self, pdf_path):
         text = ""
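
extract_text_from_pdf accumulates text from the selected reader, and with config.yml now set to 'pymupdf' extraction presumably goes through PyMuPDF. A minimal sketch of that style of extraction, offered as an assumption about what a PyMuPDF-backed reader typically does, not code from this repo:

import fitz  # PyMuPDF

def extract_text_with_pymupdf(pdf_path):  # hypothetical helper, not part of this diff
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text()
    return text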
@@ -374,7 +376,9 @@ class ChunkProcessor:
 
 class DataLoader:
     def __init__(self, config, logger=None):
-        self.file_reader = FileReader(logger=logger, kind=config["llm_params"]["pdf_reader"])
+        self.file_reader = FileReader(
+            logger=logger, kind=config["llm_params"]["pdf_reader"]
+        )
         self.chunk_processor = ChunkProcessor(config, logger=logger)
 
     def get_chunks(self, uploaded_files, weblinks):
@@ -392,19 +396,22 @@ if __name__ == "__main__":
     with open("../code/modules/config/config.yml", "r") as f:
         config = yaml.safe_load(f)
 
-    STORAGE_DIR = os.path.join(BASE_DIR, config['vectorstore']["data_path"])
+    STORAGE_DIR = os.path.join(BASE_DIR, config["vectorstore"]["data_path"])
     uploaded_files = [
-        os.path.join(STORAGE_DIR, file) for file in os.listdir(STORAGE_DIR) if file != "urls.txt"
+        os.path.join(STORAGE_DIR, file)
+        for file in os.listdir(STORAGE_DIR)
+        if file != "urls.txt"
     ]
 
     data_loader = DataLoader(config, logger=logger)
     document_chunks, document_names, documents, document_metadata = (
         data_loader.get_chunks(
-            ["https://dl4ds.github.io/sp2024/static_files/lectures/05_loss_functions_v2.pdf"],
+            [
+                "https://dl4ds.github.io/sp2024/static_files/discussion_slides/00_discussion.pdf"
+            ],
             [],
         )
     )
 
     print(document_names[:5])
     print(len(document_chunks))
-
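
The __main__ block builds uploaded_files by listing STORAGE_DIR and skipping the urls.txt bookkeeping file; the diff only reflows that comprehension and swaps the sample URL passed to get_chunks. A self-contained sketch of the same filter using a throwaway directory (file names are illustrative):

import os
import tempfile

STORAGE_DIR = tempfile.mkdtemp()
for name in ("lecture01.pdf", "urls.txt"):
    open(os.path.join(STORAGE_DIR, name), "w").close()

uploaded_files = [
    os.path.join(STORAGE_DIR, file)
    for file in os.listdir(STORAGE_DIR)
    if file != "urls.txt"
]
print(uploaded_files)  # only lecture01.pdf survives the filter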
 