Spaces:

AlignAI
/

Deep-Research-Arxiv

Running

Deep-Research-Arxiv / research /arxiv_research.py

GitsSaikat

first commit

832926c 25 days ago

8.56 kB

	# research/arxiv_research.py
	import asyncio
	import aiohttp
	import nest_asyncio
	import xml.etree.ElementTree as ET # For parsing Arxiv XML response
	nest_asyncio.apply()

	# API endpoints.
	# OpenRouter chat-completions URL; call_openrouter_async POSTs its payload here.
	OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
	# arXiv query URL; search_arxiv_async issues GET requests against it.
	ARXIV_API_URL = "http://export.arxiv.org/api/query"

	# Global API key, empty by default (the original note says it is set from app.py).
	# It is sent as the Bearer token in the OpenRouter Authorization header.
	OPENROUTER_API_KEY = ""
	# Model used by call_openrouter_async when the caller does not pass one.
	DEFAULT_MODEL = "google/gemini-2.0-flash-lite-preview-02-05:free"

	# NOTE(review): not referenced in the visible part of this module; presumably a
	# default paper count used by a caller -- confirm before relying on it.
	FIXED_PAPER_COUNT = 70
	async def call_openrouter_async(session, messages, model=DEFAULT_MODEL):
	    """
	    Send a chat-completion request to OpenRouter and return the reply text.

	    Args:
	        session: Client session whose ``post`` is used as an async context
	            manager to issue the request.
	        messages: Chat messages, forwarded unchanged as the "messages" field.
	        model: Model identifier placed in the request payload.

	    Returns:
	        The content of the first choice's message when the API answers with
	        HTTP 200; otherwise None. Non-200 responses and any exception are
	        reported with ``print`` rather than raised.
	    """
	    request_headers = {
	        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
	        "HTTP-Referer": "https://github.com/Pygen",
	        "X-Title": "Arxiv Literature Review Assistant",
	        "Content-Type": "application/json"
	    }
	    request_body = {
	        "model": model,
	        "messages": messages,
	        "temperature": 0.7,
	        "max_tokens": 4096
	    }

	    try:
	        async with session.post(OPENROUTER_URL, headers=request_headers, json=request_body) as resp:
	            # Guard clause: report the failure body and bail out early.
	            if resp.status != 200:
	                error_body = await resp.text()
	                print(f"OpenRouter API error: {resp.status} - {error_body}")
	                return None

	            data = await resp.json()
	            first_choice = data['choices'][0]
	            return first_choice['message']['content']
	    except Exception as e:
	        # Best-effort call: network, decoding and schema problems all end here.
	        print("Error during OpenRouter call:", e)
	        return None

	def _element_text(element, default="N/A"):
	    """Return an XML element's text with whitespace collapsed, or *default*."""
	    # Both a missing element and an empty one (text is None) fall back to
	    # the default instead of raising AttributeError.
	    if element is None or element.text is None:
	        return default
	    # split()/join removes the line breaks and indentation that wrapped
	    # titles and abstracts carry in the feed.
	    return ' '.join(element.text.split()) or default


	def _parse_arxiv_feed(xml_content):
	    """Convert an arXiv Atom feed document into a list of paper dictionaries."""
	    namespace = {'atom': 'http://www.w3.org/2005/Atom'}
	    root = ET.fromstring(xml_content)

	    papers = []
	    for entry in root.findall('atom:entry', namespace):
	        # Skip authors without a usable name so the join below never sees None.
	        author_names = [
	            name
	            for name in (
	                _element_text(author, default="")
	                for author in entry.findall('atom:author/atom:name', namespace)
	            )
	            if name
	        ]
	        published = _element_text(entry.find('atom:published', namespace))

	        papers.append({
	            'title': _element_text(entry.find('atom:title', namespace)),
	            'abstract': _element_text(entry.find('atom:summary', namespace)),
	            'url': _element_text(entry.find('atom:id', namespace)),
	            'authors': ', '.join(author_names or ["N/A"]),
	            # The timestamp starts with the year; a missing date stays
	            # "N/A" because that placeholder is shorter than four chars.
	            'year': published[:4],
	        })
	    return papers


	async def search_arxiv_async(session, query, max_results=100):
	    """
	    Search the Arxiv API (no API key needed) and return paper entries.

	    Args:
	        session: Client session whose ``get`` is used as an async context
	            manager to issue the request.
	        query: Value sent as the ``search_query`` parameter.
	        max_results: Maximum number of entries requested from the API.

	    Returns:
	        A list of dicts with the keys 'title', 'abstract', 'url', 'authors'
	        (comma-separated names) and 'year'. Missing or empty fields are
	        reported as "N/A". An empty list is returned on a non-200 response
	        or on any exception; both cases are reported with ``print``.
	    """
	    params = {
	        'search_query': query,
	        'start': 0,
	        'max_results': max_results,
	        'sortBy': 'relevance',
	        'sortOrder': 'descending'
	    }
	    try:
	        async with session.get(ARXIV_API_URL, params=params) as response:
	            if response.status != 200:
	                print(f"Arxiv API error: {response.status}")
	                return []
	            xml_content = await response.text()
	        # Parse after the response has been released; a malformed feed is
	        # still handled by the except clause below.
	        return _parse_arxiv_feed(xml_content)
	    except Exception as e:
	        print(f"Error during Arxiv API call: {e}")
	        return []

	async def prepare_references(paper_entries):
	    """
	    Build the numbered reference list for a sequence of paper entries.

	    Each entry must provide the keys 'authors', 'title', 'year', 'url' and
	    'abstract'. References are numbered from 1 in input order, and each one
	    carries a bracketed 'citation_key' such as "[1]".
	    """
	    return [
	        {
	            'citation_number': number,
	            'authors': entry['authors'],
	            'title': entry['title'],
	            'year': entry['year'],
	            'url': entry['url'],
	            'abstract': entry['abstract'],
	            'citation_key': f"[{number}]",
	        }
	        for number, entry in enumerate(paper_entries, start=1)
	    ]

	async def generate_bibtex_entry(ref):
	    """
	    Generate a BibTeX entry for a paper.

	    Args:
	        ref: Reference dict with the keys 'url', 'authors' (names joined
	            with ", "), 'title' and 'year'.

	    Returns:
	        The ``@article`` entry as a string, followed by a blank line.
	    """
	    url = ref['url']
	    # Old-style identifiers contain a slash (e.g. "hep-th/9901001v1"), so
	    # take everything after "/abs/" instead of only the last path segment.
	    _, separator, identifier = url.partition('/abs/')
	    arxiv_id = identifier if separator else url.split('/')[-1]
	    # Keep the citation key a single token without a path separator.
	    cite_key = arxiv_id.replace('/', '_')

	    # BibTeX separates author names with " and "; a comma-joined list would
	    # be read as a single "Last, First" name.
	    authors = ' and '.join(ref['authors'].split(', '))

	    bibtex = (
	        f"@article{{{cite_key},\n"
	        f" author = {{{authors}}},\n"
	        f" title = {{{ref['title']}}},\n"
	        f" year = {{{ref['year']}}},\n"
	        f" eprint = {{{arxiv_id}}},\n"
	        f" archivePrefix = {{arXiv}},\n"
	        f" primaryClass = {{cs.LG}},\n"  # You might want to make this dynamic
	        f" url = {{{url}}}\n"
	        f"}}\n\n"  # Extra newline separates consecutive entries
	    )
	    return bibtex

	def _minimum_review_words(paper_count):
	    """Return the minimum review length to request for *paper_count* papers, or None."""
	    if paper_count > 70:
	        return 6000
	    if paper_count >= 40:
	        return 4000
	    if paper_count >= 10:
	        return 2500
	    # Fewer than ten papers: no length requirement is imposed.
	    return None


	async def generate_literature_review_async(session, user_query, paper_entries):
	    """
	    Generate a literature review based on prepared references.

	    Args:
	        session: Client session passed through to call_openrouter_async.
	        user_query: Research topic inserted into the prompt.
	        paper_entries: Paper dicts as accepted by prepare_references.

	    Returns:
	        The review text followed by a "References" section and a
	        "BibTeX Citations:" section, each introduced by a line of 50 "="
	        characters. If the model call yields no text, the string
	        "Error generating literature review." is returned instead.
	    """
	    # First prepare all references
	    references = await prepare_references(paper_entries)

	    # Prepare paper information with citations
	    papers_info = [
	        f"Paper {ref['citation_key']}:\n"
	        f"Title: {ref['title']}\n"
	        f"Abstract: {ref['abstract']}\n"
	        f"Citation: Use {ref['citation_key']} to cite this paper"
	        for ref in references
	    ]

	    requirements = [
	        "Use formal Nature journal style",
	        "Begin with a compelling introduction",
	        "Organize findings into clear themes",
	        "Use provided citation numbers [n] when discussing papers",
	        "Each paper must be cited at least once",
	        "Make connections between related papers",
	        "Conclude with future directions",
	    ]
	    # The length target is resolved here from the real number of papers, so
	    # the prompt carries a concrete figure rather than an unfilled
	    # "{paper_count}" placeholder.
	    min_words = _minimum_review_words(len(references))
	    if min_words is not None:
	        requirements.append(
	            f"Make sure the literature review is at least {min_words} words"
	        )
	    requirements.append("DO NOT include references - they will be added separately")
	    # Numbering is generated so that no requirement number is repeated.
	    numbered_requirements = "".join(
	        f"{number}. {text}\n" for number, text in enumerate(requirements, 1)
	    )

	    # Generate Nature-style review
	    newline = "\n"
	    review_prompt = (
	        "Write a comprehensive literature review in Nature journal style. "
	        "Requirements:\n"
	        f"{numbered_requirements}"
	        f"\nTopic: {user_query}\n\n"
	        f"Available Papers:\n\n{newline.join(papers_info)}"
	    )

	    messages = [
	        {"role": "system", "content": "You are a Nature journal editor writing a literature review."},
	        {"role": "user", "content": review_prompt}
	    ]

	    literature_review = await call_openrouter_async(session, messages)

	    if not literature_review:
	        return "Error generating literature review."

	    # Format references in Nature style with links, plus a BibTeX section
	    reference_lines = []
	    bibtex_entries = []
	    for ref in references:
	        # Old-style identifiers contain a slash (e.g. "hep-th/9901001v1"),
	        # so take everything after "/abs/" rather than the last segment.
	        _, separator, identifier = ref['url'].partition('/abs/')
	        arxiv_id = identifier if separator else ref['url'].split('/')[-1]
	        reference_lines.append(
	            f"{ref['citation_number']}. {ref['authors']}. "
	            f"{ref['title']}. "
	            f"arXiv:{arxiv_id} ({ref['year']}). "
	            f"Available at: {ref['url']}\n"
	        )
	        bibtex_entries.append(await generate_bibtex_entry(ref))

	    refs_section = "\nReferences\n" + "".join(reference_lines)
	    bibtex_section = "\nBibTeX Citations:\n\n" + "".join(bibtex_entries)
	    separator_line = "\n" + "=" * 50 + "\n"

	    return (
	        literature_review +
	        separator_line +
	        refs_section +
	        separator_line +  # Separator for BibTeX
	        bibtex_section
	    )

	async def research_flow(user_query, paper_count):
	    """
	    Run the full pipeline: search Arxiv, then write the literature review.

	    Args:
	        user_query: Topic used both for the Arxiv search and the review.
	        paper_count: Number of papers to request and to pass to the review.

	    Returns:
	        The text produced by generate_literature_review_async, or a fixed
	        "no papers" message when the search returns nothing.
	    """
	    async with aiohttp.ClientSession() as http_session:
	        # Step 1: ask Arxiv for the requested number of papers.
	        found_papers = await search_arxiv_async(
	            http_session, user_query, max_results=paper_count
	        )

	        # Step 2: build the review from at most paper_count of them.
	        if found_papers:
	            selected_papers = found_papers[:paper_count]
	            return await generate_literature_review_async(
	                http_session, user_query, selected_papers
	            )

	        return "No relevant papers found. Please try a different query."

	# def main():
	# """CLI entry point."""
	# user_query = input("Enter your research topic/question: ").strip()
	# final_report = asyncio.run(research_flow(user_query))
	# print("\n==== LITERATURE REVIEW ====\n")
	# print(final_report)

	# if __name__ == "__main__":
	# main()