#!/bin/bash
#
# Entrypoint: starts the Ollama server in the background, waits for its HTTP
# API to answer, makes sure the required model is available locally, and then
# runs the FastAPI app under uvicorn in the foreground.
#
# Requires on PATH: ollama, curl, uvicorn.

set -euo pipefail

# Thread / device settings exported to every child process.
export OMP_NUM_THREADS=4        # OpenMP thread count
export MKL_NUM_THREADS=4        # Intel MKL thread count
export CUDA_VISIBLE_DEVICES=0   # expose only the first GPU, if one is present

readonly MODEL="llama3.2:1b"
readonly OLLAMA_TAGS_URL="http://localhost:11434/api/tags"
readonly max_attempts=30        # one attempt per second

# Start Ollama in the background and remember its PID so it can be monitored
# during startup and stopped when this script exits.
ollama serve &
ollama_pid=$!

# Stop the background Ollama server on any exit path (failure or normal).
cleanup() {
  kill "$ollama_pid" 2>/dev/null || true
}
trap cleanup EXIT

# Wait for the Ollama API to respond. This must happen BEFORE any
# `ollama list` / `ollama pull` call: those are client commands that talk to
# the server and fail while it is still starting.
attempt=0
until curl -s "$OLLAMA_TAGS_URL" >/dev/null; do
  # Fail fast if the server process already died instead of polling on.
  if ! kill -0 "$ollama_pid" 2>/dev/null; then
    echo "Ollama exited before becoming ready. Exiting." >&2
    exit 1
  fi
  sleep 1
  attempt=$((attempt + 1))
  if (( attempt >= max_attempts )); then
    echo "Ollama failed to start within ${max_attempts} seconds. Exiting." >&2
    exit 1
  fi
done
echo "Ollama is ready."

# Pull the model only if it is not already present. The listing is captured
# first so the check is a fixed-string match on complete output (no regex
# wildcard from the '.', and no SIGPIPE interaction with pipefail).
installed_models=$(ollama list)
if ! grep -qF -- "$MODEL" <<<"$installed_models"; then
  ollama pull "$MODEL"
fi

# Print the API URL.
echo "API is running on: http://0.0.0.0:7860"

# Run the FastAPI server in the foreground.
uvicorn app:app --host 0.0.0.0 --port 7860 --workers 4 --limit-concurrency 20