|
#!/usr/bin/env python |
|
# -*- coding: utf-8 -*- |
|
|
|
import subprocess |
|
import sys |
|
from pathlib import Path |
|
import pickle |
|
from paperqa import Settings, Docs |
|
|
|
def _single_model_config(model_name: str) -> dict:
    """Return a model-list config holding exactly one entry for ``model_name``.

    The same string is used both as the entry's ``model_name`` and as the
    ``model`` value inside its ``litellm_params``.
    """
    entry = {
        "model_name": model_name,
        "litellm_params": {"model": model_name},
    }
    return {"model_list": [entry]}


# Config for the model used for answering and summarising.
local_llm_config = _single_model_config("ollama/llama3.1")

# Config for the model used to compute embeddings.
local_emb_config = _single_model_config("ollama/mxbai-embed-large")
|
|
|
# One model serves both the answering and the summarising role; a second,
# separate model is configured for embeddings.
_settings_kwargs = {
    "llm": "ollama/llama3.1",
    "llm_config": local_llm_config,
    "summary_llm": "ollama/llama3.1",
    "summary_llm_config": local_llm_config,
    "embedding": "ollama/mxbai-embed-large",
    "embedding_config": local_emb_config,
}
settings = Settings(**_settings_kwargs)
|
|
|
def find_main_tex_file(folder_path: Path): |
|
"" |
|
Find the main LaTeX file in the given folder. |
|
|
|
This function searches for a .tex file that is likely to be the main file |
|
of a LaTeX project. It first checks for common names like 'main.tex', |
|
then looks for files containing '\\documentclass', and finally returns |
|
the first .tex file if no other criteria are met. |
|
|
|
Args: |
|
folder_path (Path): The path to the folder to search in. |
|
|
|
Returns: |
|
Path: The path to the main .tex file, or None if no .tex files are found. |
|
"" |
|
tex_files = list(folder_path.glob('**/*.tex')) |
|
if not tex_files: |
|
return None |
|
|
|
# Check for common main file names |
|
common_names = ['main.tex', 'paper.tex', 'article.tex'] |
|
for name in common_names: |
|
if name in tex_files: |
|
return name |
|
|
|
# If no common name found, look for \documentclass |
|
for file in tex_files: |
|
with open(file, 'r', encoding='utf-8') as f: |
|
content = f.read() |
|
if '\\documentclass' in content: |
|
return file |
|
# If still not found, return the first .tex file |
|
return tex_files[0] |
|
|
|
def run_latexpand(input_file, output_file):
    """
    Run the latexpand command on the input file and write the result to the output file.

    The standard output of ``latexpand <input_file>`` is captured and written
    to ``output_file`` as UTF-8 text. A short diagnostic is printed to stderr
    when latexpand fails or is missing, and the exception is then re-raised so
    that callers never continue with an output file that was not produced.

    Args:
        input_file (str or Path): The path to the input LaTeX file.
        output_file (str or Path): The path where the expanded LaTeX content will be written.

    Raises:
        subprocess.CalledProcessError: If latexpand exits with a non-zero status.
        FileNotFoundError: If the latexpand command is not found in the system PATH.
        OSError: If the output file cannot be written.
    """
    # Only the subprocess call is guarded: a FileNotFoundError raised while
    # opening the output file must not be reported as "latexpand not found".
    try:
        result = subprocess.run(
            ['latexpand', str(input_file)],
            capture_output=True,
            text=True,
            # Decode explicitly instead of relying on the locale; undecodable
            # bytes are replaced rather than aborting the expansion.
            encoding='utf-8',
            errors='replace',
            check=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"Error running latexpand: {e}", file=sys.stderr)
        if e.stderr:
            # stderr was captured, so surface it or the cause stays hidden.
            print(e.stderr.strip(), file=sys.stderr)
        raise
    except FileNotFoundError:
        print("latexpand not found. Please make sure it's installed and in your PATH.",
              file=sys.stderr)
        raise

    with open(output_file, 'w', encoding='utf-8') as output_file_handle:
        output_file_handle.write(result.stdout)
    print(f"Expanded LaTeX written to {output_file}")
|
|
|
|
|
cache_path = Path("pqa_index.pkl")

# Extensions (compared case-insensitively) of loose files that get indexed.
_INDEXABLE_SUFFIXES = (".pdf", ".txt", ".md", ".tex")


def _add_arxiv_source(collection, dir_path):
    """Flatten one ``arXiv-*`` source directory and add it to ``collection``.

    The main TeX file of ``dir_path`` is expanded with latexpand into
    ``dir_path / ".main.tex"`` and that single file is added.

    Returns:
        bool: True when the flattened file was added, False when either the
        pre-processing or the add step failed (a message is printed).
    """
    concat_main = dir_path / ".main.tex"
    try:
        # Drop the output of an earlier run first: it would otherwise be a
        # candidate for the "main" file itself, and a failed latexpand run
        # would silently leave stale content to be indexed.
        concat_main.unlink(missing_ok=True)
        # Step 1: Find the main entry TeX file
        main_file = find_main_tex_file(dir_path)
        if not main_file:
            raise ValueError("No main TeX file found.")
        # Step 2 & 3: Run latexpand and write output
        run_latexpand(main_file, concat_main)
        if not concat_main.is_file():
            raise ValueError("latexpand produced no output.")
    except (ValueError, subprocess.CalledProcessError, OSError) as preprocess_error:
        print(f"Failed to pre-process {dir_path.name}: {preprocess_error}")
        return False

    print(f"adding {dir_path} (latex source)")
    try:
        collection.add(concat_main, settings=settings, disable_check=True)
    except (IOError, OSError, ValueError) as add_error:
        print(f"Failed to add {dir_path}: {add_error}")
        return False
    return True


def _build_docs():
    """Walk the current directory and return a ``Docs`` filled with what it finds.

    Every ``arXiv-*`` directory is ingested as one flattened LaTeX document;
    if that succeeds the directory is pruned from the walk, otherwise the walk
    descends into it and its files are added individually. Loose files with a
    supported extension are added one by one.
    """
    collection = Docs()
    for root, dirs, files in Path(".").walk():
        # Iterate over a snapshot: `dirs` is edited in place to prune the walk.
        for dir_name in [name for name in dirs if name.startswith("arXiv-")]:
            if _add_arxiv_source(collection, root / dir_name):
                dirs.remove(dir_name)

        for file_name in files:
            if not file_name.lower().endswith(_INDEXABLE_SUFFIXES):
                continue
            file_path = root / file_name
            print(f"adding {file_path}")
            try:
                collection.add(file_path, settings=settings, disable_check=True)
            except (IOError, OSError, ValueError) as add_error:
                # One unreadable file must not abort the whole index build.
                print(f"Failed to add {file_path}: {add_error}")
    return collection


if cache_path.exists():
    # SECURITY: unpickling executes arbitrary code embedded in the file. Only
    # reuse a cache that this script wrote itself; never load an untrusted one.
    with open(cache_path, "rb") as f:
        docs = pickle.load(f)
else:
    docs = _build_docs()
    with open(cache_path, "wb") as f:
        pickle.dump(docs, f)
|
|
|
|
|
if __name__ == "__main__":
    # Everything after the script name is joined into one query string, so
    # the query may be passed quoted or as several separate words.
    if len(sys.argv) < 2:
        # Usage errors go to stderr and yield a non-zero exit status so that
        # calling shells and scripts can detect the failure.
        print("Please provide a query as a command-line argument.", file=sys.stderr)
        print(f"Usage: python {sys.argv[0]} 'Your query here'", file=sys.stderr)
        sys.exit(1)

    QUERY = " ".join(sys.argv[1:])
    answer = docs.query(QUERY, settings=settings)
    print(answer)
|
|