Spaces:
Running
Running
# research/arxiv_research.py | |
import asyncio | |
import aiohttp | |
import nest_asyncio | |
import xml.etree.ElementTree as ET # For parsing Arxiv XML response | |
nest_asyncio.apply() | |
# API endpoints
OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
# Use the TLS endpoint of the arXiv export host so search queries and results
# are not sent in clear text.
ARXIV_API_URL = "https://export.arxiv.org/api/query"

# Global API key (you'll set this in app.py). It is read at call time by
# call_openrouter_async to build the Authorization header.
OPENROUTER_API_KEY = ""

# Model requested by call_openrouter_async when the caller does not pass one.
DEFAULT_MODEL = "google/gemini-2.0-flash-lite-preview-02-05:free"

# Paper count for a review run (presumably the default request size --
# confirm against app.py).
FIXED_PAPER_COUNT = 70
async def call_openrouter_async(session, messages, model=DEFAULT_MODEL, *,
                                temperature=0.7, max_tokens=4096):
    """
    Make an asynchronous request to the OpenRouter chat completion API.

    Args:
        session: Object whose ``post(url, headers=..., json=...)`` returns an
            async context manager yielding the response (research_flow
            passes an ``aiohttp.ClientSession``).
        messages: Chat messages, sent unchanged as the payload's ``messages``.
        model: Model identifier to request.
        temperature: Sampling temperature sent in the payload.
        max_tokens: Completion token limit sent in the payload.

    Returns:
        The assistant's reply text, or None when the request fails, the API
        answers with a non-200 status, or the body carries no reply.
    """
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "HTTP-Referer": "https://github.com/Pygen",
        "X-Title": "Arxiv Literature Review Assistant",
        "Content-Type": "application/json"
    }
    payload = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens
    }
    try:
        async with session.post(OPENROUTER_URL, headers=headers, json=payload) as resp:
            if resp.status != 200:
                text = await resp.text()
                print(f"OpenRouter API error: {resp.status} - {text}")
                return None
            result = await resp.json()
    except Exception as e:
        # Best-effort boundary: callers treat None as "no review", so any
        # transport or decoding failure is reported and swallowed here.
        print("Error during OpenRouter call:", e)
        return None

    # A 200 response can still carry an error object instead of choices;
    # report that payload rather than a bare KeyError message.
    try:
        return result['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError):
        error = result.get('error') if isinstance(result, dict) else None
        print(f"OpenRouter API error: unexpected response body - {error or result}")
        return None
def _atom_text(entry, path, namespace):
    """Return the stripped text of the first match for ``path``, or "N/A"."""
    element = entry.find(path, namespace)
    # An element can be present but empty (text is None); treat it like a
    # missing one instead of failing on ``None.strip()``.
    if element is None or element.text is None:
        return "N/A"
    return element.text.strip()


def _parse_arxiv_feed(xml_content):
    """Convert an arXiv Atom feed document into a list of paper dicts."""
    namespace = {'atom': 'http://www.w3.org/2005/Atom'}
    root = ET.fromstring(xml_content)
    papers = []
    for entry in root.findall('atom:entry', namespace):
        author_names = [
            name.text
            for name in entry.findall('atom:author/atom:name', namespace)
            if name.text
        ]
        published = _atom_text(entry, 'atom:published', namespace)
        papers.append({
            # Long titles arrive wrapped over several indented lines;
            # collapse every whitespace run to a single space.
            'title': ' '.join(_atom_text(entry, 'atom:title', namespace).split()),
            'abstract': _atom_text(entry, 'atom:summary', namespace).replace('\n', ' '),
            'url': _atom_text(entry, 'atom:id', namespace),
            'authors': ', '.join(author_names or ["N/A"]),
            # The publication timestamp starts with the four-digit year.
            'year': published[:4] if published != "N/A" else "N/A",
        })
    return papers


async def search_arxiv_async(session, query, max_results=100):
    """
    Search Arxiv API (no API key needed) and return paper entries.

    Args:
        session: Object whose ``get(url, params=...)`` returns an async
            context manager yielding the response (research_flow passes an
            ``aiohttp.ClientSession``).
        query: Value sent as the ``search_query`` parameter.
        max_results: Value sent as the ``max_results`` parameter.

    Returns:
        A list of dicts with the keys ``title``, ``abstract``, ``url``,
        ``authors`` (comma-separated string) and ``year``. Fields missing
        from an entry are "N/A". An empty list is returned on a non-200
        status or on any request/parsing error.
    """
    params = {
        'search_query': query,
        'start': 0,
        'max_results': max_results,
        'sortBy': 'relevance',
        'sortOrder': 'descending'
    }
    try:
        async with session.get(ARXIV_API_URL, params=params) as response:
            if response.status != 200:
                print(f"Arxiv API error: {response.status}")
                return []
            xml_content = await response.text()
        return _parse_arxiv_feed(xml_content)
    except Exception as e:
        # Best-effort boundary: callers treat an empty list as "no papers".
        print(f"Error during Arxiv API call: {e}")
        return []
async def prepare_references(paper_entries):
    """
    Number the paper entries and return one reference record per paper.

    Each record copies the paper's ``authors``, ``title``, ``year``, ``url``
    and ``abstract`` and adds a 1-based ``citation_number`` together with
    the matching bracketed ``citation_key`` (for example ``[3]``).
    """
    return [
        {
            'citation_number': number,
            'authors': entry['authors'],
            'title': entry['title'],
            'year': entry['year'],
            'url': entry['url'],
            'abstract': entry['abstract'],
            'citation_key': f"[{number}]",
        }
        for number, entry in enumerate(paper_entries, start=1)
    ]
async def generate_bibtex_entry(ref):
    """
    Generate a BibTeX ``@article`` entry for a paper.

    Args:
        ref: Reference dict providing ``url``, ``authors`` (names joined
            with ", "), ``title`` and ``year``.

    Returns:
        The BibTeX entry as a string ending with a blank line.
    """
    url = ref['url']
    # Old-style arXiv identifiers contain a slash (e.g. hep-th/9901001), so
    # keep everything after "/abs/" rather than only the last path segment.
    _, marker, tail = url.partition('/abs/')
    arxiv_id = tail if marker else url.split('/')[-1]
    # Keep the citation key free of slashes; the eprint field keeps the
    # identifier verbatim.
    cite_key = arxiv_id.replace('/', '_')
    # BibTeX separates authors with " and "; a comma-joined list would be
    # read as a single "Last, First" name.
    authors = ' and '.join(ref['authors'].split(', '))
    # NOTE(review): primaryClass is hard-coded to cs.LG and is wrong for
    # papers from other categories; the reference dict carries no category.
    return (
        f"@article{{{cite_key},\n"
        f" author = {{{authors}}},\n"
        f" title = {{{ref['title']}}},\n"
        f" year = {{{ref['year']}}},\n"
        f" eprint = {{{arxiv_id}}},\n"
        " archivePrefix = {arXiv},\n"
        " primaryClass = {cs.LG},\n"
        f" url = {{{url}}}\n"
        "}\n\n"  # Extra newline separates consecutive entries
    )
async def generate_literature_review_async(session, user_query, paper_entries):
    """
    Generate literature review based on prepared references.

    Args:
        session: HTTP session forwarded to call_openrouter_async.
        user_query: Review topic, inserted into the prompt.
        paper_entries: Paper dicts as accepted by prepare_references.

    Returns:
        The model's review followed by a numbered reference list and a
        BibTeX section, or "Error generating literature review." when the
        model call yields no text.
    """
    # First prepare all references
    references = await prepare_references(paper_entries)
    paper_count = len(references)

    # Prepare paper information with citations
    papers_info = [
        f"Paper {ref['citation_key']}:\n"
        f"Title: {ref['title']}\n"
        f"Abstract: {ref['abstract']}\n"
        f"Citation: Use {ref['citation_key']} to cite this paper"
        for ref in references
    ]
    papers_block = "\n".join(papers_info)

    # Generate Nature-style review. The length requirement states the actual
    # paper count so the model can apply the matching threshold.
    review_prompt = (
        "Write a comprehensive literature review in Nature journal style. "
        "Requirements:\n"
        "1. Use formal Nature journal style\n"
        "2. Begin with a compelling introduction\n"
        "3. Organize findings into clear themes\n"
        "4. Use provided citation numbers [n] when discussing papers\n"
        "5. Each paper must be cited at least once\n"
        "6. Make connections between related papers\n"
        "7. Conclude with future directions\n"
        f"8. {paper_count} papers are provided. Make sure the literature review is at least "
        "6000 words if there are more than 70 papers, at least 4000 words for 40 to 70 papers, "
        "and at least 2500 words for 10 to 39 papers.\n"
        "9. DO NOT include references - they will be added separately\n"
        f"\nTopic: {user_query}\n\n"
        f"Available Papers:\n\n{papers_block}"
    )
    messages = [
        {"role": "system", "content": "You are a Nature journal editor writing a literature review."},
        {"role": "user", "content": review_prompt}
    ]
    literature_review = await call_openrouter_async(session, messages)
    if not literature_review:
        return "Error generating literature review."

    # Format references in Nature style and collect the BibTeX entries
    reference_lines = []
    bibtex_entries = []
    for ref in references:
        # Old-style arXiv identifiers contain a slash (e.g. hep-th/9901001),
        # so keep everything after "/abs/" rather than the last segment only.
        _, marker, tail = ref['url'].partition('/abs/')
        arxiv_id = tail if marker else ref['url'].split('/')[-1]
        reference_lines.append(
            f"{ref['citation_number']}. {ref['authors']}. "
            f"{ref['title']}. "
            f"arXiv:{arxiv_id} ({ref['year']}). "
            f"Available at: {ref['url']}\n"
        )
        bibtex_entries.append(await generate_bibtex_entry(ref))

    refs_section = "\nReferences\n" + "".join(reference_lines)
    bibtex_section = "\nBibTeX Citations:\n\n" + "".join(bibtex_entries)
    separator = "\n" + "=" * 50 + "\n"
    return literature_review + separator + refs_section + separator + bibtex_section
async def research_flow(user_query, paper_count=None):
    """
    Execute research flow with user-specified paper count.

    Args:
        user_query: Topic used both as the arXiv search query and as the
            review topic.
        paper_count: Number of papers to request from arXiv. When omitted
            (None), FIXED_PAPER_COUNT is used.

    Returns:
        The generated literature review text, or a message saying that no
        relevant papers were found.
    """
    if paper_count is None:
        paper_count = FIXED_PAPER_COUNT
    async with aiohttp.ClientSession() as session:
        # Step 1: Get the number of papers requested
        paper_entries = await search_arxiv_async(session, user_query, max_results=paper_count)
        if not paper_entries:
            return "No relevant papers found. Please try a different query."
        # Step 2: Generate review with prepared references; the slice caps
        # the list at paper_count even if the search returned more.
        return await generate_literature_review_async(
            session, user_query, paper_entries[:paper_count]
        )
# def main(): | |
# """CLI entry point.""" | |
# user_query = input("Enter your research topic/question: ").strip() | |
# final_report = asyncio.run(research_flow(user_query)) | |
# print("\n==== LITERATURE REVIEW ====\n") | |
# print(final_report) | |
# if __name__ == "__main__": | |
# main() |