Add application file
- Dockerfile +24 -0
- README.md +3 -3
- app.py +55 -0
- requirements.txt +5 -0
- start.sh +34 -0
Dockerfile
ADDED
@@ -0,0 +1,24 @@
+FROM python:3.9-slim
+
+# Install curl and Ollama
+RUN apt-get update && apt-get install -y curl && \
+    curl -fsSL https://ollama.ai/install.sh | sh && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Set up user and environment
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+    PATH="/home/user/.local/bin:$PATH"
+
+WORKDIR $HOME/app
+
+COPY --chown=user requirements.txt .
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+COPY --chown=user . .
+
+# Make the start script executable
+RUN chmod +x start.sh
+
+CMD ["./start.sh"]
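Not part of this commit: for local testing, the image can be built and run with the standard Docker CLI. A minimal sketch follows; the tag/name "tinyllama-space" is an illustrative assumption, and port 7860 is the port start.sh hands to uvicorn.

# Hypothetical local build-and-run sketch ("tinyllama-space" is an assumed name, not defined by this commit).
docker build -t tinyllama-space .
docker run --rm --name tinyllama-space -p 7860:7860 tinyllama-space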
README.md
CHANGED
@@ -1,8 +1,8 @@
 ---
 title: TinyLlama
-emoji:
-colorFrom:
-colorTo:
+emoji: 🐢
+colorFrom: red
+colorTo: green
 sdk: docker
 pinned: false
 ---
app.py
ADDED
@@ -0,0 +1,55 @@
+import os
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from langchain_ollama import ChatOllama
+from langchain.schema import StrOutputParser
+from langchain.prompts import ChatPromptTemplate
+import logging
+from functools import lru_cache
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+app = FastAPI()
+
+MODEL_NAME = 'tinyllama'
+
+@lru_cache()
+def get_llm():
+    return ChatOllama(model=MODEL_NAME)
+
+@lru_cache()
+def get_chain():
+    llm = get_llm()
+    prompt = ChatPromptTemplate.from_template("Question: {question}\n\nAnswer:")
+    return prompt | llm | StrOutputParser()
+
+class Question(BaseModel):
+    text: str
+
+@app.get("/")
+def read_root():
+    return {"Hello": f"Welcome to {MODEL_NAME} FastAPI"}
+
+@app.post("/ask")
+async def ask_question(question: Question):
+    try:
+        logger.info(f"Received question: {question.text}")
+        chain = get_chain()
+        response = chain.invoke({"question": question.text})
+        logger.info("Response generated successfully")
+        return {"answer": response}
+    except Exception as e:
+        logger.error(f"Error in /ask endpoint: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.on_event("startup")
+async def startup_event():
+    logger.info(f"Starting up with model: {MODEL_NAME}")
+    # Warm up the cache
+    get_chain()
+
+@app.on_event("shutdown")
+async def shutdown_event():
+    logger.info("Shutting down")
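As a usage sketch (not part of this commit), the two endpoints above can be exercised with curl once the app is listening; the base URL http://localhost:7860 is an assumption taken from the port start.sh passes to uvicorn.

# Hypothetical example requests against the endpoints defined in app.py.
curl http://localhost:7860/
curl -X POST http://localhost:7860/ask \
     -H "Content-Type: application/json" \
     -d '{"text": "What is the capital of France?"}'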
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+fastapi
+uvicorn[standard]
+langchain-ollama
+langchain
+pydantic
start.sh
ADDED
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+# Set environment variables for optimization
+export OMP_NUM_THREADS=4  # Optimize OpenMP threading
+export MKL_NUM_THREADS=4  # Optimize Intel MKL threading
+export CUDA_VISIBLE_DEVICES=0  # Use the first GPU if available
+
+# Start Ollama in the background (it uses the GPU automatically when one is available)
+ollama serve &
+
+# Wait for Ollama to start up before talking to it (use a robust check)
+max_attempts=30
+attempt=0
+while ! curl -s http://localhost:11434/api/tags >/dev/null; do
+    sleep 1
+    attempt=$((attempt + 1))
+    if [ $attempt -eq $max_attempts ]; then
+        echo "Ollama failed to start within 30 seconds. Exiting."
+        exit 1
+    fi
+done
+
+echo "Ollama is ready."
+
+# Pull the model if not already present
+if ! ollama list | grep -q "tinyllama"; then
+    ollama pull tinyllama
+fi
+
+# Print the API URL
+echo "API is running on: http://0.0.0.0:7860"
+
+# Start the FastAPI server with optimized settings
+uvicorn app:app --host 0.0.0.0 --port 7860 --workers 4 --limit-concurrency 20
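Outside this commit, a quick way to sanity-check both processes the script launches is to exec into the running container and reuse the same probes; the container name "tinyllama-space" is the assumed name from the build-and-run sketch above.

# Hypothetical manual checks; "tinyllama-space" is an assumed container name.
docker exec tinyllama-space ollama list                              # confirms the server is up and tinyllama is pulled
docker exec tinyllama-space curl -s http://localhost:11434/api/tags  # the same readiness probe start.sh polls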