writing-prototypes

Running

App Files Files Community

kcarnold committed about 19 hours ago

Commit

9b8968e

1 Parent(s): 898f051

sync up with the backend

Browse files

Files changed (5) hide show

custom_llm.py +43 -5
custom_llm_inference.py +70 -1
pyproject.toml +15 -1
test_llm_inference.py +65 -0
uv.lock +0 -0

custom_llm.py CHANGED Viewed

@@ -5,6 +5,7 @@ from contextlib import asynccontextmanager
 from pathlib import Path
 from typing import Dict, List, Optional
 import torch
 import uvicorn
 from fastapi import FastAPI, HTTPException
@@ -12,7 +13,7 @@ from fastapi.middleware.cors import CORSMiddleware
 from fastapi.testclient import TestClient
 from transformers import AutoModelForCausalLM, AutoTokenizer
-from custom_llm_inference import get_highlights_inner, get_next_token_predictions_inner
 ml_models = {}
@@ -36,7 +37,12 @@ async def models_lifespan(app: FastAPI):
     ml_models["llm"] = llm = {
         'tokenizer': AutoTokenizer.from_pretrained(model_name),
-        'model': AutoModelForCausalLM.from_pretrained(model_name, device_map="auto" if USE_GPU else "cpu", torch_dtype=dtype)
     }
     print("Loaded llm with device map:")
     print(llm['model'].hf_device_map)
@@ -61,7 +67,7 @@ async def models_lifespan(app: FastAPI):
     start = time.time()
     response = client.get("/api/gen_revisions",
-        params={"doc": test_doc, "prompt": test_prompt, "n": 1})
     print(f"Gen revisions endpoint: {time.time() - start:.2f}s")
     yield
@@ -132,7 +138,9 @@ def get_next_token_predictions(original_doc: str,
 def gen_revisions(
         prompt: str,
         doc: str,
-        n: Optional[int] = 5):
     model = ml_models['llm']['model']
@@ -148,7 +156,7 @@ def gen_revisions(
     generations = model.generate(
         tokenized_chat, num_return_sequences=n,
-        max_length=1024, do_sample=True, top_k=50, top_p=0.95, temperature=0.5,
         return_dict_in_generate=True, output_scores=True)
     generated_docs = tokenizer.batch_decode(generations.sequences, skip_special_tokens=True)
     #print(generations.scores)
@@ -166,5 +174,35 @@ def gen_revisions(
     }
 if __name__ == "__main__":
     uvicorn.run(app, host="localhost", port=PORT)

 from pathlib import Path
 from typing import Dict, List, Optional
+from pydantic import BaseModel
 import torch
 import uvicorn
 from fastapi import FastAPI, HTTPException
 from fastapi.testclient import TestClient
 from transformers import AutoModelForCausalLM, AutoTokenizer
+from custom_llm_inference import get_highlights_inner, get_next_token_predictions_inner, continue_messages_inner
 ml_models = {}
     ml_models["llm"] = llm = {
         'tokenizer': AutoTokenizer.from_pretrained(model_name),
+        'model': AutoModelForCausalLM.from_pretrained(
+            model_name,
+            device_map="auto" if USE_GPU else "cpu",
+            torch_dtype=dtype,
+            attn_implementation='eager'
+        )
     }
     print("Loaded llm with device map:")
     print(llm['model'].hf_device_map)
     start = time.time()
     response = client.get("/api/gen_revisions",
+        params={"doc": test_doc, "prompt": test_prompt, "n": 1, "max_length": 16})
     print(f"Gen revisions endpoint: {time.time() - start:.2f}s")
     yield
 def gen_revisions(
         prompt: str,
         doc: str,
+        n: Optional[int] = 5,
+        max_length: Optional[int] = 1024,
+        ):
     model = ml_models['llm']['model']
     generations = model.generate(
         tokenized_chat, num_return_sequences=n,
+        max_new_tokens=max_length, do_sample=True, top_k=50, top_p=0.95, temperature=0.5,
         return_dict_in_generate=True, output_scores=True)
     generated_docs = tokenizer.batch_decode(generations.sequences, skip_special_tokens=True)
     #print(generations.scores)
     }
+class Message(BaseModel):
+    # A single chat turn, used as the element type of ContinueMessagesRequest.messages.
+    # Speaker of the turn; continue_messages_inner compares it against "assistant".
+    role: str
+    # Text of the turn.
+    content: str
+class ContinueMessagesRequest(BaseModel):
+    # Request body for POST /api/continue_messages.
+    # Conversation so far; the endpoint rejects an empty list with HTTP 400.
+    messages: List[Message]
+    # How many of the most likely next tokens to branch on (one continuation per branch).
+    n_branch_tokens: int = 5
+    # How many further tokens to generate greedily after each branch token.
+    n_future_tokens: int = 5
+@app.post('/api/continue_messages')
+def continue_messages(request: ContinueMessagesRequest):
+    messages = [{"role": m.role, "content": m.content} for m in request.messages]
+    if len(messages) == 0:
+        raise HTTPException(status_code=400, detail="At least one message must be provided.")
+    n_branch_tokens = request.n_branch_tokens
+    n_future_tokens = request.n_future_tokens
+    model = ml_models['llm']['model']
+    tokenizer = ml_models['llm']['tokenizer']
+    generated_docs = continue_messages_inner(model, tokenizer, messages, n_branch_tokens, n_future_tokens)
+    return {
+        'continuations': [dict(doc_text=doc) for doc in generated_docs]
+    }
 if __name__ == "__main__":
     uvicorn.run(app, host="localhost", port=PORT)

custom_llm_inference.py CHANGED Viewed

@@ -37,7 +37,8 @@ def get_highlights_inner(model, tokenizer, doc, prompt, updated_doc, k):
     updated_doc_ids = tokenize_doc_in_progress(tokenizer, updated_doc)
     joined_ids = torch.cat([tokenized_chat, updated_doc_ids])
-    # Call the model
     with torch.no_grad():
         logits = model(joined_ids[None].to(model.device)).logits[0].cpu()
@@ -191,3 +192,71 @@ def get_next_token_predictions_slow(
     decoded_next_tokens = tokenizer.batch_decode(lookahead_sequences, skip_special_tokens=True)
     return decoded_next_tokens, next_token_logits

     updated_doc_ids = tokenize_doc_in_progress(tokenizer, updated_doc)
     joined_ids = torch.cat([tokenized_chat, updated_doc_ids])
+    # Compute the next-token logits for the entire document
     with torch.no_grad():
         logits = model(joined_ids[None].to(model.device)).logits[0].cpu()
     decoded_next_tokens = tokenizer.batch_decode(lookahead_sequences, skip_special_tokens=True)
     return decoded_next_tokens, next_token_logits
+def continue_messages_inner(model, tokenizer, messages, n_branch_tokens, n_future_tokens):
+    device = model.device
+    final_message_is_assistant = messages[-1]['role'] == "assistant"
+    print(f"final_message_is_assistant: {final_message_is_assistant}")
+    # if final_message_is_assistant:
+    #     tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, continue_final_message=True, return_tensors="pt").to(model.device)
+    # else:
+    #     tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(model.device)
+    tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, return_tensors="pt", continue_final_message=True).to(model.device)
+    print(tokenizer.batch_decode(tokenized_chat, skip_special_tokens=False))
+    # This fails with
+    # RuntimeError: Index put requires the source and destination dtypes match, got BFloat16 for the destination and Float for the source.
+    # generations = model.generate(
+    #     tokenized_chat,
+    #     num_return_sequences=n_branch_tokens,
+    #     num_beam_groups=n_branch_tokens, num_beams=n_branch_tokens,
+    #     do_sample=False, max_new_tokens=n_future_tokens, diversity_penalty=1e5, top_k=None,
+    #     return_dict_in_generate=True, output_scores=True)
+    # Instead, we'll do this in two steps:
+    # 1. Get the next token predictions for the k most likely continuations
+    from transformers.cache_utils import DynamicCache
+    past_key_values = DynamicCache()
+    with torch.no_grad():
+        model_outs = model(
+            tokenized_chat,
+            past_key_values=past_key_values,
+            output_hidden_states=True,
+            use_cache=True,
+        )
+        branch_tokens = model_outs.logits[0, -1].topk(n_branch_tokens).indices
+    hypotheses = branch_tokens.unsqueeze(1)
+    # Branch off the k most likely continuations
+    past_key_values.reorder_cache(torch.zeros((n_branch_tokens,), dtype=torch.long, device=device))
+    # 2. Generate the next n_future_tokens for each branch
+    for i in range(n_future_tokens):
+        position_id_for_final_token = tokenized_chat.shape[0] + i
+        cache_position = torch.full((1,), position_id_for_final_token, dtype=int, device=device)
+        final_token_ids = hypotheses[:, -1:]
+        with torch.no_grad():
+            model_outs = model(
+                final_token_ids,
+                past_key_values=past_key_values,
+                output_hidden_states=True,
+                use_cache=True,
+                cache_position=cache_position
+            )
+        # Grab the single most likely token from each of the k sequences
+        next_token_logits = model_outs.logits[:, -1]
+        vocab_size = model.config.vocab_size
+        assert next_token_logits.shape == (n_branch_tokens, vocab_size), f"{next_token_logits.shape=}, {n_branch_tokens=}, {vocab_size=}"
+        most_likely_token_ids = next_token_logits.argmax(dim=-1)
+        hypotheses = torch.cat([
+            hypotheses,
+            most_likely_token_ids.unsqueeze(1)
+        ], dim=1)
+    generated_docs = tokenizer.batch_decode(hypotheses, skip_special_tokens=True)
+    return generated_docs

pyproject.toml CHANGED Viewed

@@ -3,9 +3,23 @@ name = "writing-prototypes"
 version = "0.1.0"
 description = "Add your description here"
 readme = "README.md"
-requires-python = ">=3.10"
 dependencies = [
     "pandas>=2.2.3",
     "requests>=2.32.3",
     "streamlit==1.40.1",
 ]

 version = "0.1.0"
 description = "Add your description here"
 readme = "README.md"
+requires-python = ">=3.11,<3.13"
 dependencies = [
+    "fastapi>=0.115.8",
     "pandas>=2.2.3",
+    "pydantic>=2.10.6",
     "requests>=2.32.3",
     "streamlit==1.40.1",
 ]
+[dependency-groups]
+gpu = [
+    "accelerate>=1.1.1",
+    "torch>=2.5.1",
+    "transformers>=4.46.2",
+    "tokenizers>=0.21.0",
+]
+dev = [
+    "ipython>=8.32.0",
+    "marimo>=0.10.6",
+]

test_llm_inference.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import pytest
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import custom_llm_inference
+from transformers.cache_utils import DynamicCache
+@pytest.fixture
+def model_and_tokenizer():
+    model_name = 'google/gemma-2-2b-it'
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    if tokenizer.bos_token_id is None:
+        tokenizer.bos_token_id = tokenizer.pad_token_id
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        device_map="cpu",
+        #torch_dtype=torch.float16
+    )
+    return model, tokenizer
+@pytest.fixture
+def sample_inputs():
+    doc = "The quick brown fox loves to jump over lazy dogs."
+    prompt = "Rewrite this document to make more sense."
+    doc_in_progress = "Sure, here's the document rewritten as requested:\n\nA fox,"
+    return doc, prompt, doc_in_progress
+def test_get_next_token_predictions(model_and_tokenizer, sample_inputs):
+    model, tokenizer = model_and_tokenizer
+    doc, prompt, doc_in_progress = sample_inputs
+    predictions = custom_llm_inference.get_next_token_predictions_slow(
+        model, tokenizer, doc, prompt, doc_in_progress=doc_in_progress, k=5
+    )
+    assert len(predictions) == 2  # Should return (token_texts, logits)
+    assert len(predictions[0]) == 5  # Should return k=5 predictions
+    assert predictions[1].shape[1] == model.config.vocab_size
+def test_get_tokenized_chat(model_and_tokenizer, sample_inputs):
+    model, tokenizer = model_and_tokenizer
+    doc, prompt, _ = sample_inputs
+    tokenized_chat = custom_llm_inference.get_tokenized_chat(tokenizer, prompt, doc)
+    assert isinstance(tokenized_chat, torch.Tensor)
+    assert tokenized_chat.dim() == 1
+    assert tokenized_chat.dtype == torch.int64
+def test_highlights(model_and_tokenizer, sample_inputs):
+    model, tokenizer = model_and_tokenizer
+    doc, prompt, updated_doc = sample_inputs
+    highlights = custom_llm_inference.get_highlights_inner(
+        model, tokenizer, doc, prompt, updated_doc=updated_doc, k=5
+    )
+    assert isinstance(highlights, list)
+    assert len(highlights) > 0
+    for h in highlights:
+        assert h['start'] >= 0
+        assert h['end'] >= h['start']
+        assert isinstance(h['token'], str)
+        assert isinstance(h['token_loss'], float)
+        assert isinstance(h['most_likely_token'], str)
+        assert isinstance(h['topk_tokens'], list)

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff