Spaces:

AlignAI
/

Deep-Research-Arxiv

Running

File size: 8,562 Bytes

832926c

# research/arxiv_research.py
# Async helpers that search the arXiv API and ask an OpenRouter-hosted chat
# model to write a literature review, then append a numbered reference list
# and BibTeX entries for the papers that were found.
import asyncio
import aiohttp
import nest_asyncio
import xml.etree.ElementTree as ET  # For parsing Arxiv XML response
# Runs at import time. Per the nest_asyncio documentation this patches asyncio
# so that an already-running event loop can be re-entered.
nest_asyncio.apply()

# API Endpoints
OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
ARXIV_API_URL = "http://export.arxiv.org/api/query"

# Global API Key (You'll set this in app.py)
# Looked up each time call_openrouter_async builds its Authorization header,
# so it must be assigned before the first request is made.
OPENROUTER_API_KEY = ""
# Default value of call_openrouter_async's `model` parameter.
DEFAULT_MODEL = "google/gemini-2.0-flash-lite-preview-02-05:free"

FIXED_PAPER_COUNT = 70  # NOTE(review): presumably the default number of papers per review - confirm against callers
async def call_openrouter_async(session, messages, model=DEFAULT_MODEL):
    """
    Send a chat-completion request to OpenRouter and return the reply text.

    Args:
        session: Object whose ``post(url, headers=..., json=...)`` returns an
            async context manager yielding the response (the rest of this
            module passes an ``aiohttp.ClientSession``).
        messages: Chat messages, forwarded unchanged as the ``messages`` field.
        model: Model identifier, forwarded unchanged as the ``model`` field.

    Returns:
        The ``content`` of the first choice's message on HTTP 200, otherwise
        ``None``. Non-200 responses and any exception (including an
        unexpected response shape) are printed and reported as ``None``.
    """
    request_headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "HTTP-Referer": "https://github.com/Pygen",
        "X-Title": "Arxiv Literature Review Assistant",
        "Content-Type": "application/json",
    }
    request_body = dict(
        model=model,
        messages=messages,
        temperature=0.7,
        max_tokens=4096,
    )

    try:
        async with session.post(
            OPENROUTER_URL, headers=request_headers, json=request_body
        ) as resp:
            # Guard clause: anything other than 200 is logged with its body.
            if resp.status != 200:
                error_body = await resp.text()
                print(f"OpenRouter API error: {resp.status} - {error_body}")
                return None

            data = await resp.json()
            first_choice = data['choices'][0]
            return first_choice['message']['content']
    except Exception as e:
        print("Error during OpenRouter call:", e)
        return None

def _clean_text(element, default="N/A"):
    """Return the element's text with whitespace runs collapsed, or `default` if it is missing or empty."""
    if element is None or element.text is None:
        return default
    # split()/join removes the newline-plus-indent wrapping found in long
    # titles and abstracts as well as leading/trailing whitespace.
    return ' '.join(element.text.split()) or default


def _parse_arxiv_feed(xml_content):
    """Convert an arXiv Atom feed (XML string) into a list of paper dicts."""
    namespace = {'atom': 'http://www.w3.org/2005/Atom'}
    root = ET.fromstring(xml_content)

    papers = []
    for entry in root.findall('atom:entry', namespace):
        author_names = [
            name
            for name in (
                _clean_text(node, default="")
                for node in entry.findall('atom:author/atom:name', namespace)
            )
            if name
        ] or ["N/A"]

        published = _clean_text(entry.find('atom:published', namespace))
        # The publication timestamp starts with the four-digit year.
        year = published[:4] if published != "N/A" else "N/A"

        papers.append({
            'title': _clean_text(entry.find('atom:title', namespace)),
            'abstract': _clean_text(entry.find('atom:summary', namespace)),
            'url': _clean_text(entry.find('atom:id', namespace)),
            'authors': ', '.join(author_names),
            'year': year
        })
    return papers


async def search_arxiv_async(session, query, max_results=100):
    """
    Search Arxiv API (no API key needed) and return paper entries.

    Args:
        session: Object whose ``get(url, params=...)`` returns an async
            context manager yielding the response (the rest of this module
            passes an ``aiohttp.ClientSession``).
        query: Value sent as the ``search_query`` parameter.
        max_results: Value sent as the ``max_results`` parameter.

    Returns:
        A list of dicts with the keys ``title``, ``abstract``, ``url``,
        ``authors`` (comma-separated string) and ``year``. Missing or empty
        fields become ``"N/A"``; a single incomplete entry no longer discards
        the whole result set. Returns ``[]`` on a non-200 status or on any
        request/parsing error (the error is printed).
    """
    params = {
        'search_query': query,
        'start': 0,
        'max_results': max_results,
        'sortBy': 'relevance',
        'sortOrder': 'descending'
    }
    try:
        async with session.get(ARXIV_API_URL, params=params) as response:
            if response.status != 200:
                print(f"Arxiv API error: {response.status}")
                return []
            xml_content = await response.text()
        # Parsing stays inside the try so malformed XML is still reported as
        # an empty result rather than an exception, as before.
        return _parse_arxiv_feed(xml_content)
    except Exception as e:
        print(f"Error during Arxiv API call: {e}")
        return []

async def prepare_references(paper_entries):
    """
    Turn paper entries into numbered reference records.

    Each record copies ``authors``, ``title``, ``year``, ``url`` and
    ``abstract`` from the paper and adds a 1-based ``citation_number`` plus
    the matching ``citation_key`` string (``"[1]"``, ``"[2]"``, ...).
    """
    return [
        {
            'citation_number': number,
            'authors': entry['authors'],
            'title': entry['title'],
            'year': entry['year'],
            'url': entry['url'],
            'abstract': entry['abstract'],
            'citation_key': f"[{number}]",
        }
        for number, entry in enumerate(paper_entries, start=1)
    ]

async def generate_bibtex_entry(ref):
    """
    Generate BibTeX entry for a paper.

    Args:
        ref: Mapping with the keys ``url``, ``authors``, ``title`` and
            ``year``. ``authors`` may be a comma-separated string or a
            list/tuple of names. An optional ``primary_class`` key overrides
            the arXiv category, which otherwise stays at ``cs.LG``.

    Returns:
        The ``@article`` entry as a string terminated by a blank line.
    """
    url = ref['url']
    # Old-style identifiers contain a slash (e.g. "hep-th/9901001v1"), so keep
    # everything after "/abs/" instead of only the last path segment.
    abs_marker = '/abs/'
    if abs_marker in url:
        arxiv_id = url.split(abs_marker, 1)[1]
    else:
        arxiv_id = url.split('/')[-1]

    # BibTeX separates the names of multiple authors with " and "; a
    # comma-separated list would be parsed as a single malformed name.
    authors = ref['authors']
    if isinstance(authors, str):
        author_names = [name.strip() for name in authors.split(',') if name.strip()]
    else:
        author_names = list(authors)
    author_field = ' and '.join(author_names)

    primary_class = ref.get('primary_class', 'cs.LG')

    bibtex = (
        f"@article{{{arxiv_id},\n"
        f"  author = {{{author_field}}},\n"
        f"  title = {{{ref['title']}}},\n"
        f"  year = {{{ref['year']}}},\n"
        f"  eprint = {{{arxiv_id}}},\n"
        f"  archivePrefix = {{arXiv}},\n"
        f"  primaryClass = {{{primary_class}}},\n"
        f"  url = {{{url}}}\n"
        f"}}\n\n"  # Blank line keeps consecutive entries visually separated
    )
    return bibtex

def _arxiv_id_from_url(url):
    """Return the arXiv identifier in an abstract URL, keeping old-style prefixes such as "hep-th/"."""
    abs_marker = '/abs/'
    if abs_marker in url:
        return url.split(abs_marker, 1)[1]
    return url.split('/')[-1]


def _minimum_review_words(paper_count):
    """Return the minimum review length for `paper_count` papers, or None when there is no length requirement."""
    if paper_count > 70:
        return 6000
    if paper_count >= 40:
        return 4000
    if paper_count >= 10:
        return 2500
    return None


def _build_review_prompt(user_query, references):
    """Build the user prompt: numbered requirements, the topic, and one block per paper."""
    requirements = [
        "Use formal Nature journal style",
        "Begin with a compelling introduction",
        "Organize findings into clear themes",
        "Use provided citation numbers [n] when discussing papers",
        "Each paper must be cited at least once",
        "Make connections between related papers",
        "Conclude with future directions",
    ]

    # The length rule is resolved here from the real paper count; the prompt
    # previously contained an unsubstituted "{paper_count}" placeholder.
    paper_count = len(references)
    min_words = _minimum_review_words(paper_count)
    if min_words is not None:
        requirements.append(
            f"Make sure the literature review is at least {min_words} words "
            f"(it covers {paper_count} papers)"
        )
    requirements.append("DO NOT include references - they will be added separately")

    # Numbering is generated so items can never share a number again.
    numbered_requirements = "\n".join(
        f"{number}. {text}" for number, text in enumerate(requirements, 1)
    )
    papers_info = "\n".join(
        f"Paper {ref['citation_key']}:\n"
        f"Title: {ref['title']}\n"
        f"Abstract: {ref['abstract']}\n"
        f"Citation: Use {ref['citation_key']} to cite this paper"
        for ref in references
    )

    return (
        "Write a comprehensive literature review in Nature journal style. "
        "Requirements:\n"
        f"{numbered_requirements}\n"
        f"\nTopic: {user_query}\n\n"
        f"Available Papers:\n\n{papers_info}"
    )


async def generate_literature_review_async(session, user_query, paper_entries):
    """
    Generate literature review based on prepared references.

    Args:
        session: HTTP session forwarded to ``call_openrouter_async``.
        user_query: Research topic inserted into the prompt.
        paper_entries: Paper dicts as accepted by ``prepare_references``.

    Returns:
        The model's review followed by a separator line, a numbered
        "References" section, another separator and a "BibTeX Citations"
        section. If the model call yields nothing, returns the string
        ``"Error generating literature review."``.
    """
    # First prepare all references
    references = await prepare_references(paper_entries)

    messages = [
        {"role": "system", "content": "You are a Nature journal editor writing a literature review."},
        {"role": "user", "content": _build_review_prompt(user_query, references)}
    ]

    literature_review = await call_openrouter_async(session, messages)
    if not literature_review:
        return "Error generating literature review."

    # Collect the pieces in lists and join once instead of growing strings.
    reference_lines = []
    bibtex_entries = []
    for ref in references:
        arxiv_id = _arxiv_id_from_url(ref['url'])
        reference_lines.append(
            f"{ref['citation_number']}. {ref['authors']}. "
            f"{ref['title']}. "
            f"arXiv:{arxiv_id} ({ref['year']}). "
            f"Available at: {ref['url']}\n"
        )
        bibtex_entries.append(await generate_bibtex_entry(ref))

    refs_section = "\nReferences\n" + "".join(reference_lines)
    bibtex_section = "\nBibTeX Citations:\n\n" + "".join(bibtex_entries)
    separator = "\n" + "=" * 50 + "\n"

    return literature_review + separator + refs_section + separator + bibtex_section

async def research_flow(user_query, paper_count=None):
    """
    Execute research flow with user-specified paper count.

    Args:
        user_query: Research topic, used both as the arXiv search query and
            as the topic of the generated review.
        paper_count: Maximum number of papers to fetch and review. When
            omitted (``None``) the module-level ``FIXED_PAPER_COUNT`` is used,
            so existing two-argument callers are unaffected.

    Returns:
        The text produced by ``generate_literature_review_async``, or
        ``"No relevant papers found. Please try a different query."`` when
        the search returns nothing.
    """
    # Resolved at call time rather than in the signature so a later change to
    # FIXED_PAPER_COUNT is picked up.
    if paper_count is None:
        paper_count = FIXED_PAPER_COUNT

    async with aiohttp.ClientSession() as session:
        # Step 1: Ask arXiv for at most `paper_count` papers
        paper_entries = await search_arxiv_async(session, user_query, max_results=paper_count)

        if not paper_entries:
            return "No relevant papers found. Please try a different query."

        # Step 2: Generate review with prepared references; the slice caps
        # the list in case the API returned more than was asked for.
        literature_review = await generate_literature_review_async(session, user_query, paper_entries[:paper_count])
        return literature_review

# def main():
#     """CLI entry point."""
#     user_query = input("Enter your research topic/question: ").strip()
#     final_report = asyncio.run(research_flow(user_query, FIXED_PAPER_COUNT))
#     print("\n==== LITERATURE REVIEW ====\n")
#     print(final_report)

# if __name__ == "__main__":
#     main()