Hjgugugjhuhjggg committed on
Commit a6c0d65 (verified)
Parent: 544dfe8

Update app.py

Files changed (1): app.py (+220, -91)
app.py CHANGED
@@ -1,96 +1,225 @@
  import os
- import hashlib
- import uvicorn
- from fastapi import FastAPI, Request
- from fastapi.responses import JSONResponse
- from langchain.llms import VLLM
- from gptcache import Cache
- from gptcache.manager.factory import manager_factory
- from gptcache.processor.pre import get_prompt
- from langchain_community.callbacks.manager import get_openai_callback
- from sklearn.metrics.pairwise import cosine_similarity
- from sentence_transformers import SentenceTransformer
  import torch
- import langchain
- import spaces

  app = FastAPI()

- def get_hashed_name(name):
-     return hashlib.sha256(name.encode()).hexdigest()
-
- def init_gptcache(cache_obj, llm):
-     hashed_llm = get_hashed_name(llm)
-     cache_obj.init(pre_embedding_func=get_prompt, data_manager=manager_factory(manager="map", data_dir=f"map_cache_{hashed_llm}"))
-
- cache = Cache()
-
- hf_token = os.environ.get("HF_TOKEN")
-
- llm_models = {
-     "4": VLLM(model="lilmeaty/4", trust_remote_code=True, use_cuda=False, max_new_tokens=50, temperature=0.1, use_auth_token=hf_token, device="cpu"),
-     "yi-coder": VLLM(model="01-ai/Yi-Coder-1.5B", trust_remote_code=True, use_cuda=False, max_new_tokens=50, temperature=0.6, use_auth_token=hf_token, device="cpu"),
-     "llama": VLLM(model="meta-llama/Llama-3.2-3B-Instruct", trust_remote_code=True, use_cuda=False, max_new_tokens=50, temperature=0.1, use_auth_token=hf_token, device="cpu"),
-     "qwen": VLLM(model="Qwen/Qwen2.5-1.5B-Instruct", trust_remote_code=True, use_cuda=False, max_new_tokens=50, temperature=0.6, use_auth_token=hf_token, device="cpu"),
- }
-
- for llm_name, llm in llm_models.items():
-     init_gptcache(cache, llm_name)
-
- langchain.llm_cache = langchain.cache.GPTCache(session=cache)
-
- try:
-     sentence_model = SentenceTransformer('all-mpnet-base-v2', device='cpu')
- except Exception as e:
-     print(f"Error loading SentenceTransformer: {e}")
-     sentence_model = None
-
- @app.get("/")
- def read_root():
-     return {"Hello": "World"}
-
- @app.post("/v1/generateText")
- @spaces.GPU()
- async def generateText(request: Request):
-     request_dict = await request.json()
-     prompt = request_dict.pop("prompt")
-     max_tokens = request_dict.get("max_tokens", -1)
-
-     all_responses = {}
-     for model_name, llm in llm_models.items():
-         try:
-             with get_openai_callback() as cb:
-                 if max_tokens == -1:
-                     full_response = llm(prompt)
-                 else:
-                     full_response = ""
-                     current_prompt = prompt
-                     while True:
-                         response_part = llm(current_prompt, max_new_tokens=max_tokens)
-                         full_response += response_part
-                         if len(full_response) >= max_tokens or response_part == "":
-                             break
-                         current_prompt = full_response
-                 print(cb)
-             all_responses[model_name] = full_response
-             print(f"Model {model_name}: {full_response}")
-         except Exception as e:
-             print(f"Error with model {model_name}: {e}")
-
-     if not all_responses:
-         return JSONResponse({"error": "All models failed to generate text"}, status_code=500)
-
-     if sentence_model:
-         embeddings = sentence_model.encode(list(all_responses.values()))
-         similarities = cosine_similarity(embeddings)
-         avg_similarity = similarities.mean(axis=0)
-         best_model = list(all_responses.keys())[avg_similarity.argmax()]
-         best_response = all_responses[best_model]
-     else:
-         best_model = list(all_responses.keys())[0]
-         best_response = all_responses[best_model]
-
-     return JSONResponse({"best_model": best_model, "text": best_response, "all_responses": all_responses})
-
- if __name__ == "__main__":
-     uvicorn.run(app, host="0.0.0.0", port=7860)

+ import gc
+ import psutil
  import os
+ import time
  import torch
+ from fastapi import FastAPI
+ from vllm import LLM as VLLM  # vLLM exposes its offline engine as LLM; aliased to keep the VLLM name used below
+ from chatgptcache import cache  # assumed to provide a SimpleCache exposing get() and put()
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+ import nltk
+ from nltk.tokenize import sent_tokenize, word_tokenize
+ from nltk.corpus import stopwords
+ from collections import Counter
+ import asyncio
+ import torch.nn.utils.prune as prune
+ from concurrent.futures import ThreadPoolExecutor
+
+ nltk.download('punkt')
+ nltk.download('stopwords')

  app = FastAPI()

+ # Define the models (they will be loaded later)
+ model_1 = None
+ model_2 = None
+ model_3 = None
+ model_4 = None
+
+ cache_1 = cache.SimpleCache()
+ cache_2 = cache.SimpleCache()
+ cache_3 = cache.SimpleCache()
+ cache_4 = cache.SimpleCache()
+
+ previous_responses_1 = []
+ previous_responses_2 = []
+ previous_responses_3 = []
+ previous_responses_4 = []
+
+ MAX_TOKENS = 2048  # Maximum number of tokens for model input and output
+
+ # Use a ThreadPoolExecutor for parallel execution
+ executor = ThreadPoolExecutor(max_workers=4)
+
+ # Device configuration (CPU); vLLM expects the device as a string
+ device = "cpu"
+
+ def get_best_response(new_response, previous_responses):
+     if not previous_responses:
+         return new_response
+     vectorizer = TfidfVectorizer().fit_transform(previous_responses + [new_response])
+     cosine_sim = cosine_similarity(vectorizer[-1], vectorizer[:-1])
+     max_sim_index = cosine_sim.argmax()
+     max_sim_score = cosine_sim[0][max_sim_index]
+     if max_sim_score > 0.7:
+         return previous_responses[max_sim_index]
+     return new_response
+
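+ # get_best_response vectorizes the new response together with earlier ones using TF-IDF
+ # and reuses the most similar earlier response when cosine similarity exceeds 0.7.
+ # A minimal sketch of the idea (the strings are hypothetical, shown for illustration only):
+ #
+ #   previous = ["The capital of France is Paris."]
+ #   get_best_response("Paris is the capital city of France.", previous)
+ #   # returns the stored sentence if its TF-IDF cosine similarity is above 0.7,
+ #   # otherwise returns the new response unchanged
+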
+ def summarize_text(text):
+     sentences = sent_tokenize(text)
+     stop_words = set(stopwords.words("english"))
+     word_frequencies = Counter()
+     for sentence in sentences:
+         words = word_tokenize(sentence.lower())
+         words = [word for word in words if word.isalpha() and word not in stop_words]
+         word_frequencies.update(words)
+     most_common_words = word_frequencies.most_common(50)
+     most_common_words = {word: freq for word, freq in most_common_words}
+     ranked_sentences = []
+     for sentence in sentences:
+         score = sum(most_common_words.get(word, 0) for word in word_tokenize(sentence.lower()))
+         ranked_sentences.append((score, sentence))
+     ranked_sentences.sort(reverse=True, key=lambda x: x[0])
+     summary = ' '.join([sentence for _, sentence in ranked_sentences[:3]])
+     return summary
+
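+ # summarize_text is a simple extractive summarizer: sentences are scored by the frequency
+ # of their non-stopword tokens and the three highest-scoring sentences are concatenated
+ # (in score order, not document order) to form the summary.
+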
+ def clear_memory():
+     gc.collect()
+     process = psutil.Process(os.getpid())
+     memory_usage = psutil.virtual_memory().percent
+     if memory_usage > 90:
+         global model_1, model_2, model_3, model_4
+         model_1 = None
+         model_2 = None
+         model_3 = None
+         model_4 = None
+         gc.collect()
+
+ def apply_pruning(model):
+     # The vLLM wrapper does not expose the underlying torch modules directly,
+     # so pruning is only applied when named_modules() is available.
+     if not hasattr(model, "named_modules"):
+         return model
+     for name, module in model.named_modules():
+         if isinstance(module, torch.nn.Linear):
+             prune.random_unstructured(module, name="weight", amount=0.2)
+             prune.remove(module, name="weight")  # Optional: remove the pruning mask to make the pruned weights permanent
+     return model
+
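+ # apply_pruning uses torch.nn.utils.prune: random_unstructured zeroes 20% of each Linear
+ # layer's weights, and prune.remove folds the mask into the weight tensor so the pruning
+ # becomes permanent. A minimal sketch of the same call pattern on a plain torch module:
+ #
+ #   layer = torch.nn.Linear(8, 8)
+ #   prune.random_unstructured(layer, name="weight", amount=0.2)
+ #   prune.remove(layer, name="weight")  # weights stay pruned, mask is discarded
+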
+ def split_input(input_text, max_tokens):
+     tokens = input_text.split()  # Split the input into words (tokens)
+     chunks = []
+     chunk = []
+     total_tokens = 0
+
+     for word in tokens:
+         word_length = len(word.split())  # Rough token-length estimate (one per word)
+         if total_tokens + word_length > max_tokens:
+             chunks.append(" ".join(chunk))
+             chunk = [word]
+             total_tokens = word_length
+         else:
+             chunk.append(word)
+             total_tokens += word_length
+
+     if chunk:
+         chunks.append(" ".join(chunk))  # Add the last chunk
+
+     return chunks
+
+ def split_output(output_text, max_tokens):
+     tokens = output_text.split()  # Split the output into words (tokens)
+     chunks = []
+     chunk = []
+     total_tokens = 0
+
+     for word in tokens:
+         word_length = len(word.split())  # Rough token-length estimate (one per word)
+         if total_tokens + word_length > max_tokens:
+             chunks.append(" ".join(chunk))
+             chunk = [word]
+             total_tokens = word_length
+         else:
+             chunk.append(word)
+             total_tokens += word_length
+
+     if chunk:
+         chunks.append(" ".join(chunk))  # Add the last chunk
+
+     return chunks
+
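+ # Both helpers chunk text by whitespace-separated words, so MAX_TOKENS effectively caps
+ # words per chunk rather than true model tokens. For example:
+ #
+ #   split_input("a b c d e", 2)  # -> ["a b", "c d", "e"]
+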
+ async def load_model_async(model_name: str):
+     max_model_len = MAX_TOKENS  # Set the maximum model length (tokens)
+     if model_name == "model_1":
+         return VLLM("Hjgugugjhuhjggg/llama-3.2-1B-spinquant-hf", device=device, max_model_len=max_model_len)
+     elif model_name == "model_2":
+         return VLLM("meta-llama/Llama-3.2-1B", device=device, max_model_len=max_model_len)
+     elif model_name == "model_3":
+         return VLLM("Qwen/Qwen2.5-3B-Instruct", device=device, max_model_len=max_model_len)
+     elif model_name == "model_4":
+         return VLLM("gpt2", device=device, max_model_len=max_model_len)
+     return None
+
+ async def load_models():
+     global model_1, model_2, model_3, model_4
+     tasks = [
+         load_model_async("model_1"),
+         load_model_async("model_2"),
+         load_model_async("model_3"),
+         load_model_async("model_4"),
+     ]
+     results = await asyncio.gather(*tasks)
+     model_1, model_2, model_3, model_4 = results
+     model_1 = apply_pruning(model_1)
+     model_2 = apply_pruning(model_2)
+     model_3 = apply_pruning(model_3)
+     model_4 = apply_pruning(model_4)
+     print("Models loaded and pruned successfully.")
+
+ async def optimize_models_periodically():
+     while True:
+         await load_models()  # Reload and re-optimize the models automatically
+         await asyncio.sleep(3600)  # Optimize the models every hour (adjust the interval as needed)
+
+ @app.on_event("startup")
+ async def startup():
+     await load_models()
+     # Registering new "startup" handlers from inside a startup handler never runs them,
+     # so the periodic jobs are scheduled as background tasks instead.
+     asyncio.create_task(monitor_memory())
+     asyncio.create_task(optimize_models_periodically())
+
+ async def monitor_memory():
+     while True:
+         clear_memory()
+         await asyncio.sleep(60)
+
+ @app.get("/generate")
+ async def generate_response(model_name: str, input_text: str):
+     def generate_for_model(model, input_text, cache, previous_responses):
+         cached_output = cache.get(input_text)
+         if cached_output:
+             return cached_output
+
+         input_chunks = split_input(input_text, MAX_TOKENS)
+         output_text = ""
+         prev_output = ""
+
+         for chunk in input_chunks:
+             prompt = (prev_output + " " + chunk) if prev_output else chunk
+             outputs = model.generate(prompt)
+             # vLLM returns a list of RequestOutput objects; take the generated text
+             output_text += outputs[0].outputs[0].text
+             # Keep the last 50 words as rolling context for the next chunk
+             prev_output = " ".join(output_text.split()[-50:])
+
+         output_chunks = split_output(output_text, MAX_TOKENS)
+         best_response = get_best_response(output_chunks[0], previous_responses)
+         cache.put(input_text, best_response)
+         previous_responses.append(best_response)
+         return best_response
+
+     result = await asyncio.get_event_loop().run_in_executor(
+         executor,
+         generate_for_model,
+         model_1 if model_name == "model1" else model_2 if model_name == "model2" else model_3 if model_name == "model3" else model_4,
+         input_text,
+         cache_1 if model_name == "model1" else cache_2 if model_name == "model2" else cache_3 if model_name == "model3" else cache_4,
+         previous_responses_1 if model_name == "model1" else previous_responses_2 if model_name == "model2" else previous_responses_3 if model_name == "model3" else previous_responses_4
+     )
+     return {f"{model_name}_output": result}
+
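+ # Request flow for /generate: check the per-model cache, otherwise chunk the input, generate
+ # chunk by chunk with a 50-word rolling context, keep either the new text or a sufficiently
+ # similar earlier response, cache it, and return it as {"<model_name>_output": ...}.
+ # Generation runs in the ThreadPoolExecutor so the event loop stays responsive.
+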
+ @app.get("/unified_summary")
+ async def unified_summary(input_text: str):
+     output1 = await generate_response(model_name="model1", input_text=input_text)
+     output2 = await generate_response(model_name="model2", input_text=input_text)
+     output3 = await generate_response(model_name="model3", input_text=input_text)
+     output4 = await generate_response(model_name="model4", input_text=input_text)
+     combined_response = output1.get("model1_output", "") + " " + \
+                         output2.get("model2_output", "") + " " + \
+                         output3.get("model3_output", "") + " " + \
+                         output4.get("model4_output", "")
+     summarized_response = summarize_text(combined_response)
+     return {"summary": summarized_response}
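A minimal client sketch for the two endpoints. The new app.py no longer starts uvicorn itself, so the host and port depend on how the Space launches it; localhost:7860, the requests library, and the example prompt below are assumptions for illustration only.

import requests

BASE = "http://localhost:7860"  # assumed host/port; adjust to the deployment

resp = requests.get(f"{BASE}/generate", params={"model_name": "model1", "input_text": "Hello"})
print(resp.json())  # {"model1_output": "..."}

resp = requests.get(f"{BASE}/unified_summary", params={"input_text": "Hello"})
print(resp.json())  # {"summary": "..."}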