toolkit / papers_please
k4d3's picture
Initial commit
f1a2ec8
raw
history blame
5.31 kB
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import subprocess
import sys
from pathlib import Path
import pickle
from paperqa import Settings, Docs
def _single_model_config(model_name):
    """Return a config dict whose ``model_list`` holds exactly one entry.

    The entry uses *model_name* both as its ``model_name`` and as the
    ``model`` value inside ``litellm_params``.
    """
    entry = {
        "model_name": model_name,
        "litellm_params": {"model": model_name},
    }
    return {"model_list": [entry]}


# Config for the chat model; presumably served by a local Ollama instance
# (inferred from the "ollama/" prefix -- confirm against your deployment).
local_llm_config = _single_model_config("ollama/llama3.1")

# Config for the embedding model, built in the same shape as the one above.
local_emb_config = _single_model_config("ollama/mxbai-embed-large")
# Model identifiers, named once so the answer model and the summary model
# cannot drift apart by accident.
_CHAT_MODEL = "ollama/llama3.1"
_EMBEDDING_MODEL = "ollama/mxbai-embed-large"

# Shared settings object: it is passed to every docs.add() call while the
# index is built and to docs.query() when a question is asked.
settings = Settings(
    # Model names.
    llm=_CHAT_MODEL,
    summary_llm=_CHAT_MODEL,
    embedding=_EMBEDDING_MODEL,
    # Matching per-model configs; the answer and summary models share the
    # very same config dict.
    llm_config=local_llm_config,
    summary_llm_config=local_llm_config,
    embedding_config=local_emb_config,
)
def find_main_tex_file(folder_path: Path) -> "Path | None":
    """
    Find the main LaTeX file in the given folder.

    This function searches (recursively) for a .tex file that is likely to be
    the main file of a LaTeX project. It first checks for common names like
    'main.tex', then looks for files containing '\\documentclass', and finally
    returns the first .tex file if no other criteria are met.

    Candidates are examined in a deterministic order: files closer to
    ``folder_path`` come first, ties are broken alphabetically.

    Args:
        folder_path (Path): The path to the folder to search in.

    Returns:
        Path: The path to the main .tex file, or None if no .tex files are found.
    """
    # Keep regular files only (a directory may also be called "x.tex") and
    # sort them so the result does not depend on directory listing order.
    tex_files = sorted(
        (path for path in folder_path.glob('**/*.tex') if path.is_file()),
        key=lambda path: (len(path.parts), str(path)),
    )
    if not tex_files:
        return None

    # Check for common main file names. The comparison is done on the file
    # *name*: tex_files holds Path objects, so testing a bare string for
    # membership in the list can never succeed.
    common_names = ['main.tex', 'paper.tex', 'article.tex']
    for name in common_names:
        for candidate in tex_files:
            if candidate.name == name:
                return candidate

    # If no common name found, look for \documentclass
    for candidate in tex_files:
        # errors='replace' keeps legacy-encoded sources (e.g. Latin-1) from
        # raising UnicodeDecodeError; the ASCII marker is unaffected.
        with open(candidate, 'r', encoding='utf-8', errors='replace') as f:
            content = f.read()
        if '\\documentclass' in content:
            return candidate

    # If still not found, return the first .tex file
    return tex_files[0]
def run_latexpand(input_file, output_file):
    """
    Run the latexpand command on the input file and write the result to the output file.

    This function uses the latexpand tool to flatten a LaTeX file (inlining the
    files it pulls in via \\input / \\include) into a single document. The
    expanded content is captured from latexpand's standard output and written
    to the specified output file as UTF-8.

    A short diagnostic is printed when latexpand fails, and the exception is
    then re-raised so the caller can skip the document instead of continuing
    with an output file that was never produced.

    Args:
        input_file (str or Path): The path to the input LaTeX file.
        output_file (str or Path): The path where the expanded LaTeX content will be written.

    Raises:
        subprocess.CalledProcessError: If latexpand encounters an error during execution.
        FileNotFoundError: If the latexpand command is not found in the system PATH.
        OSError: If the output file cannot be written.
    """
    # Only the subprocess call is guarded, so that a failure to open the
    # output file is not misreported as "latexpand not found".
    try:
        result = subprocess.run(
            ['latexpand', str(input_file)],
            capture_output=True, text=True, check=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"Error running latexpand: {e}")
        raise
    except FileNotFoundError:
        print("latexpand not found. Please make sure it's installed and in your PATH.")
        raise

    with open(output_file, 'w', encoding='utf-8') as output_file_handle:
        output_file_handle.write(result.stdout)
    print(f"Expanded LaTeX written to {output_file}")
# Location of the pickled document index, relative to the working directory.
cache_path = Path("pqa_index.pkl")

if cache_path.exists():
    # SECURITY: pickle.load() can execute arbitrary code embedded in the file.
    # Only reuse a cache that this script wrote itself; delete pqa_index.pkl
    # if its origin is in doubt and let the index be rebuilt.
    with open(cache_path, "rb") as f:
        docs = pickle.load(f)
else:
    docs = Docs()

    # Path.walk() is top-down by default, so removing a name from `dirs`
    # prevents the walk from descending into that directory.
    for root, dirs, files in Path(".").walk():
        # Iterate over a snapshot because `dirs` is pruned inside the loop.
        # Every "arXiv-*" directory is handled (not just the first one), so
        # sibling source trees are not indexed file-by-file later on.
        for dir_name in list(dirs):
            if not dir_name.startswith("arXiv-"):
                continue

            dir_path = root / dir_name
            concat_main = dir_path / ".main.tex"
            try:
                # Step 1: Find the main entry TeX file
                main_file = find_main_tex_file(dir_path)
                if not main_file:
                    raise ValueError("No main TeX file found.")
                # Step 2 & 3: Run latexpand and write output
                run_latexpand(main_file, concat_main)
            except (ValueError, subprocess.CalledProcessError,
                    FileNotFoundError) as preprocess_error:
                print(f"Failed to pre-process {dir_name}: {preprocess_error}")
                continue

            print(f"adding {dir_path} (latex source)")
            try:
                docs.add(concat_main, settings=settings, disable_check=True)
            except (IOError, OSError, ValueError) as add_error:
                print(f"Failed to add {dir_path}: {add_error}")
                continue

            # Added as one flattened document: do not walk into the
            # directory as well. Directories that failed above stay in
            # `dirs`, so their individual files are still picked up.
            dirs.remove(dir_name)

        # Loose documents in this directory are indexed regardless of
        # whether an "arXiv-*" sibling was found.
        for file_name in files:
            if file_name.lower().endswith((".pdf", ".txt", ".md", ".tex")):
                file_path = root / file_name
                print(f"adding {file_path}")
                try:
                    docs.add(file_path, settings=settings, disable_check=True)
                except (IOError, OSError, ValueError) as add_error:
                    # Same best-effort policy as for the arXiv sources: one
                    # unreadable file must not abort the whole index build.
                    print(f"Failed to add {file_path}: {add_error}")

    # Persist the freshly built index so later runs can skip the build.
    with open(cache_path, "wb") as f:
        pickle.dump(docs, f)
if __name__ == "__main__":
    # Everything after the script name forms the question; the words are
    # joined with single spaces so the query may be passed unquoted.
    cli_words = sys.argv[1:]
    if not cli_words:
        print("Please provide a query as a command-line argument.")
        print("Usage: python script_name.py 'Your query here'")
    else:
        QUERY = " ".join(cli_words)
        answer = docs.query(QUERY, settings=settings)
        print(answer)