File size: 5,305 Bytes
f1a2ec8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import subprocess
import sys
from pathlib import Path
import pickle
from paperqa import Settings, Docs

# Model identifiers, each used both as the router alias ("model_name") and as
# the underlying litellm model string.
_LLM_MODEL = "ollama/llama3.1"
_EMBEDDING_MODEL = "ollama/mxbai-embed-large"

# Router-style configuration with a single entry for the chat model.
local_llm_config = {
    "model_list": [
        {"model_name": _LLM_MODEL, "litellm_params": {"model": _LLM_MODEL}},
    ]
}

# Router-style configuration with a single entry for the embedding model.
local_emb_config = {
    "model_list": [
        {
            "model_name": _EMBEDDING_MODEL,
            "litellm_params": {"model": _EMBEDDING_MODEL},
        },
    ]
}

# The answer LLM and the summary LLM share one model and one config object;
# embeddings use their own model and config.
settings = Settings(
    llm=_LLM_MODEL,
    llm_config=local_llm_config,
    summary_llm=_LLM_MODEL,
    summary_llm_config=local_llm_config,
    embedding=_EMBEDDING_MODEL,
    embedding_config=local_emb_config,
)

def find_main_tex_file(folder_path: Path):
    """
    Find the main LaTeX file in the given folder.

    The folder is searched recursively for .tex files. Candidates are tried
    in three passes, and within each pass files closer to the top of the
    folder are preferred (ties broken alphabetically):

    1. A file named 'main.tex', 'paper.tex' or 'article.tex' (in that order
       of preference).
    2. A file whose content contains '\\documentclass'.
    3. The first .tex file found.

    Args:
        folder_path (Path): The path to the folder to search in.

    Returns:
        Path: The path to the main .tex file, or None if no .tex files are found.
    """
    # glob() order depends on the filesystem, so sort to get a stable result;
    # shallow files come first because a project's entry point normally sits
    # at the top level rather than in a sub-folder of included chapters.
    tex_files = sorted(
        (path for path in folder_path.glob('**/*.tex') if path.is_file()),
        key=lambda path: (len(path.parts), str(path)),
    )
    if not tex_files:
        return None

    # Check for common main file names. The glob yields Path objects, so the
    # comparison has to be made against each candidate's file name.
    common_names = ['main.tex', 'paper.tex', 'article.tex']
    for name in common_names:
        for file in tex_files:
            if file.name == name:
                return file

    # If no common name found, look for \documentclass. Undecodable bytes are
    # replaced rather than raised on: sources in legacy encodings still
    # contain the ASCII marker we are looking for.
    for file in tex_files:
        with open(file, 'r', encoding='utf-8', errors='replace') as f:
            if '\\documentclass' in f.read():
                return file

    # If still not found, return the first .tex file
    return tex_files[0]

def run_latexpand(input_file, output_file):
    """
    Run the latexpand command on the input file and write the result to the output file.

    The command's standard output is captured, decoded as UTF-8 and written
    to the specified output file. Nothing is written if the command fails.

    Args:
        input_file (str or Path): The path to the input LaTeX file.
        output_file (str or Path): The path where the expanded LaTeX content will be written.

    Raises:
        subprocess.CalledProcessError: If latexpand exits with a non-zero status.
        FileNotFoundError: If the latexpand command is not found in the system PATH.
        OSError: If the output file cannot be written.
    """
    try:
        # Decode explicitly instead of relying on the locale; undecodable
        # bytes are replaced so a legacy-encoded source cannot abort the run.
        result = subprocess.run(['latexpand', str(input_file)],
                                capture_output=True, check=True,
                                encoding='utf-8', errors='replace')
    except subprocess.CalledProcessError as e:
        print(f"Error running latexpand: {e}")
        if e.stderr:
            print(e.stderr.strip())
        # Re-raise so the caller does not go on to use an output file that
        # was never produced.
        raise
    except FileNotFoundError:
        print("latexpand not found. Please make sure it's installed and in your PATH.")
        raise

    # The write sits outside the try block so that a missing output directory
    # is not misreported as a missing latexpand executable.
    with open(output_file, 'w', encoding='utf-8') as output_file_handle:
        output_file_handle.write(result.stdout)
    print(f"Expanded LaTeX written to {output_file}")


# Build the document index once and reuse it on later runs via a pickle cache.
cache_path = Path("pqa_index.pkl")

if cache_path.exists():
    # NOTE(review): unpickling executes whatever the file contains, so only
    # load a cache that this script wrote itself.
    with open(cache_path, "rb") as f:
        docs = pickle.load(f)
else:
    docs = Docs()
    for root, dirs, files in Path(".").walk():
        # "arXiv-*" folders are treated as LaTeX projects: each is expanded
        # into one file and indexed as a single document. Iterate over a
        # snapshot because indexed folders are removed from `dirs` to stop
        # the walk from descending into them.
        for dir_name in list(dirs):
            if not dir_name.startswith("arXiv-"):
                continue
            dir_path = root / dir_name
            concat_main = dir_path / ".main.tex"
            try:
                # Remove output left by an earlier run so it can neither be
                # picked as the main file nor mistaken for fresh output.
                concat_main.unlink(missing_ok=True)
                # Step 1: Find the main entry TeX file
                main_file = find_main_tex_file(dir_path)
                if not main_file:
                    raise ValueError("No main TeX file found.")
                # Step 2 & 3: Run latexpand and write output
                run_latexpand(main_file, concat_main)
                if not concat_main.exists():
                    raise ValueError("latexpand produced no output.")
            except (ValueError, subprocess.CalledProcessError,
                    OSError) as preprocess_error:
                # The folder stays in `dirs`, so its files are still indexed
                # one by one when the walk descends into it.
                print(f"Failed to pre-process {dir_name}: {preprocess_error}")
                continue
            print(f"adding {dir_path} (latex source)")
            try:
                docs.add(concat_main, settings=settings, disable_check=True)
            except (OSError, ValueError) as add_error:
                print(f"Failed to add {dir_path}: {add_error}")
                continue
            dirs.remove(dir_name)

        # Loose documents in this directory are indexed regardless of whether
        # it also contained arXiv folders.
        for file_name in files:
            if file_name == ".main.tex":
                # Generated by the expansion step above; never index it as a
                # loose file alongside the sources it was built from.
                continue
            if file_name.lower().endswith((".pdf", ".txt", ".md", ".tex")):
                file_path = root / file_name
                print(f"adding {file_path}")
                try:
                    docs.add(file_path, settings=settings, disable_check=True)
                except (OSError, ValueError) as add_error:
                    # One unreadable file must not abort the whole build.
                    print(f"Failed to add {file_path}: {add_error}")

    with open(cache_path, "wb") as f:
        pickle.dump(docs, f)


if __name__ == "__main__":
    # With no arguments beyond the script name there is nothing to ask, so
    # print the usage hint; otherwise join the arguments into one query.
    if len(sys.argv) <= 1:
        print("Please provide a query as a command-line argument.")
        print("Usage: python script_name.py 'Your query here'")
    else:
        QUERY = " ".join(sys.argv[1:])
        answer = docs.query(QUERY, settings=settings)
        print(answer)