Drag2121 committed
Commit 55567e0 · 1 Parent(s): 533d2d0

Add application file

Files changed (5)
  1. Dockerfile +24 -0
  2. README.md +3 -3
  3. app.py +55 -0
  4. requirements.txt +5 -0
  5. start.sh +34 -0
Dockerfile ADDED
@@ -0,0 +1,24 @@
+ FROM python:3.9-slim
+
+ # Install curl and Ollama
+ RUN apt-get update && apt-get install -y curl && \
+     curl -fsSL https://ollama.ai/install.sh | sh && \
+     apt-get clean && rm -rf /var/lib/apt/lists/*
+
+ # Set up user and environment
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR $HOME/app
+
+ COPY --chown=user requirements.txt .
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+ COPY --chown=user . .
+
+ # Make the start script executable
+ RUN chmod +x start.sh
+
+ CMD ["./start.sh"]
README.md CHANGED
@@ -1,8 +1,8 @@
  ---
  title: TinyLlama
- emoji: 🐨
- colorFrom: yellow
- colorTo: gray
+ emoji: 🐢
+ colorFrom: red
+ colorTo: green
  sdk: docker
  pinned: false
  ---
app.py ADDED
@@ -0,0 +1,55 @@
+ import os
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ from langchain_ollama import ChatOllama
+ from langchain.schema import StrOutputParser
+ from langchain.prompts import ChatPromptTemplate
+ import logging
+ from functools import lru_cache
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ app = FastAPI()
+
+ MODEL_NAME = 'tinyllama'
+
+ @lru_cache()
+ def get_llm():
+     return ChatOllama(model=MODEL_NAME)
+
+ @lru_cache()
+ def get_chain():
+     llm = get_llm()
+     prompt = ChatPromptTemplate.from_template("Question: {question}\n\nAnswer:")
+     return prompt | llm | StrOutputParser()
+
+ class Question(BaseModel):
+     text: str
+
+ @app.get("/")
+ def read_root():
+     return {"Hello": f"Welcome to {MODEL_NAME} FastAPI"}
+
+ @app.post("/ask")
+ async def ask_question(question: Question):
+     try:
+         logger.info(f"Received question: {question.text}")
+         chain = get_chain()
+         response = chain.invoke({"question": question.text})
+         logger.info("Response generated successfully")
+         return {"answer": response}
+     except Exception as e:
+         logger.error(f"Error in /ask endpoint: {str(e)}")
+         raise HTTPException(status_code=500, detail=str(e))
+
+ @app.on_event("startup")
+ async def startup_event():
+     logger.info(f"Starting up with model: {MODEL_NAME}")
+     # Warm up the cache
+     get_chain()
+
+ @app.on_event("shutdown")
+ async def shutdown_event():
+     logger.info("Shutting down")
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ fastapi
+ uvicorn[standard]
+ langchain-ollama
+ langchain
+ pydantic
start.sh ADDED
@@ -0,0 +1,34 @@
+ #!/bin/bash
+
+ # Set environment variables for optimization
+ export OMP_NUM_THREADS=4       # Optimize OpenMP threading
+ export MKL_NUM_THREADS=4       # Optimize Intel MKL threading
+ export CUDA_VISIBLE_DEVICES=0  # Use the first GPU if available
+
+ # Start Ollama in the background; GPU selection is handled via CUDA_VISIBLE_DEVICES above
+ ollama serve &
+
+ # Wait for the Ollama API to come up before talking to it
+ max_attempts=30
+ attempt=0
+ while ! curl -s http://localhost:11434/api/tags >/dev/null; do
+     sleep 1
+     attempt=$((attempt + 1))
+     if [ $attempt -eq $max_attempts ]; then
+         echo "Ollama failed to start within 30 seconds. Exiting."
+         exit 1
+     fi
+ done
+
+ echo "Ollama is ready."
+
+ # Pull the model if not already present (requires the server to be running)
+ if ! ollama list | grep -q "tinyllama"; then
+     ollama pull tinyllama
+ fi
+
+ # Print the API URL
+ echo "API is running on: http://0.0.0.0:7860"
+
+ # Start the FastAPI server with optimized settings
+ uvicorn app:app --host 0.0.0.0 --port 7860 --workers 4 --limit-concurrency 20