Show a problem with the current approach.
- pyproject.toml +1 -0
- test_llm_inference.py +64 -1
- uv.lock +35 -0
pyproject.toml
CHANGED
@@ -8,6 +8,7 @@ dependencies = [
     "fastapi>=0.115.8",
     "pandas>=2.2.3",
     "pydantic>=2.10.6",
+    "pytest>=8.3.4",
     "requests>=2.32.3",
     "streamlit==1.40.1",
 ]
test_llm_inference.py
CHANGED
@@ -13,7 +13,7 @@ def model_and_tokenizer():
     model = AutoModelForCausalLM.from_pretrained(
         model_name,
         device_map="cpu",
-
+        torch_dtype=torch.float16
     )
     return model, tokenizer
 
@@ -63,3 +63,66 @@ def test_highlights(model_and_tokenizer, sample_inputs):
         assert isinstance(h['token_loss'], float)
         assert isinstance(h['most_likely_token'], str)
         assert isinstance(h['topk_tokens'], list)
+
+def compare_lookahead_predictions(model, tokenizer, doc, prompt, doc_in_progress, k=5):
+    """
+    Extracts and compares the next token predictions between the fast method and slow method.
+    Returns the differences between the two approaches for analysis.
+    """
+    # Get predictions from the fast method (using cache)
+    fast_tokens, fast_logits = custom_llm_inference.get_next_token_predictions_inner(
+        model, tokenizer, doc, prompt, doc_in_progress, k
+    )
+
+    # Get predictions from the slow method (recomputing for each token)
+    slow_tokens, slow_logits = custom_llm_inference.get_next_token_predictions_slow(
+        model, tokenizer, doc, prompt, doc_in_progress, k
+    )
+
+    # Compare the decoded tokens (this is what users will see)
+    token_matches = [fast == slow for fast, slow in zip(fast_tokens, slow_tokens)]
+
+    # Calculate the difference in logits for most likely next tokens
+    fast_most_likely = fast_logits.argmax(dim=-1)
+    slow_most_likely = slow_logits.argmax(dim=-1)
+    logit_match = torch.eq(fast_most_likely, slow_most_likely).cpu().numpy()
+
+    # Calculate numerical difference in logits
+    logit_diff_norm = torch.linalg.vector_norm((fast_logits - slow_logits).to(torch.float32), dim=1).cpu().numpy()
+
+    return {
+        "fast_tokens": fast_tokens,
+        "slow_tokens": slow_tokens,
+        "token_matches": token_matches,
+        "token_match_all": all(token_matches),
+        "logit_match": logit_match,
+        "logit_diff_norm": logit_diff_norm
+    }
+
+def test_lookahead_token_consistency(model_and_tokenizer, sample_inputs):
+    """
+    Test that demonstrates the potential issue with cache position indices
+    when generating lookahead tokens.
+    """
+    model, tokenizer = model_and_tokenizer
+    doc, prompt, doc_in_progress = sample_inputs
+
+    results = compare_lookahead_predictions(model, tokenizer, doc, prompt, doc_in_progress)
+
+    # Check if the tokens are the same
+    assert results["token_match_all"], (
+        f"Fast and slow methods produced different tokens.\n"
+        f"Fast: {results['fast_tokens']}\n"
+        f"Slow: {results['slow_tokens']}"
+    )
+
+    # Check if the most likely next tokens based on logits are the same
+    assert all(results["logit_match"]), (
+        f"Fast and slow methods predicted different most likely next tokens"
+    )
+
+    # Check that the logit differences are minimal
+    # This might fail if there's a bug in the cache position indices
+    assert all(diff < 1e-4 for diff in results["logit_diff_norm"]), (
+        f"Significant difference in logits between fast and slow methods: {results['logit_diff_norm']}"
+    )
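
Note: the test above calls custom_llm_inference.get_next_token_predictions_slow, whose implementation is not part of this commit. As a reading aid only, here is a minimal sketch, in plain PyTorch/transformers, of what a cache-free "slow" reference of this kind could look like. The function name, the tokenized-context argument, and the assumption that it returns the decoded top-k candidates together with the logits of the token following each candidate are assumptions, not the project's actual code.

# Hypothetical sketch only -- not the implementation in custom_llm_inference.
# It recomputes a full forward pass (no KV cache) for each candidate lookahead token.
import torch

def slow_lookahead_sketch(model, tokenizer, context_ids, k=5):
    """For each of the k most likely next tokens, rerun the model on
    context + candidate and return the logits of the token after it."""
    with torch.no_grad():
        next_logits = model(context_ids).logits[0, -1]   # logits for the next token
    candidate_ids = next_logits.topk(k).indices          # k candidate next tokens

    lookahead_logits = []
    for tok_id in candidate_ids:
        extended = torch.cat([context_ids, tok_id.view(1, 1)], dim=1)
        with torch.no_grad():
            out = model(extended)                        # full recompute, no past_key_values
        lookahead_logits.append(out.logits[0, -1])       # logits after the candidate

    decoded = [tokenizer.decode(t) for t in candidate_ids]
    return decoded, torch.stack(lookahead_logits)

A correct cached fast path should reproduce both outputs; note that the test casts the logit difference to float32 before taking the norm and compares it against a 1e-4 threshold.
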
uv.lock
CHANGED
@@ -287,6 +287,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 },
 ]
 
+[[package]]
+name = "iniconfig"
+version = "2.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d7/4b/cbd8e699e64a6f16ca3a8220661b5f83792b3017d0f79807cb8708d33913/iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3", size = 4646 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ef/a6/62565a6e1cf69e10f5727360368e451d4b7f58beeac6173dc9db836a5b46/iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374", size = 5892 },
+]
+
 [[package]]
 name = "ipython"
 version = "8.32.0"
@@ -719,6 +728,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/0b/30/2b61876e2722374558b871dfbfcbe4e406626d63f4f6ed92e9c8e24cac37/pillow-11.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:27a7860107500d813fcd203b4ea19b04babe79448268403172782754870dac25", size = 2254890 },
 ]
 
+[[package]]
+name = "pluggy"
+version = "1.5.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/96/2d/02d4312c973c6050a18b314a5ad0b3210edb65a906f868e31c111dede4a6/pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1", size = 67955 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 },
+]
+
 [[package]]
 name = "prompt-toolkit"
 version = "3.0.50"
@@ -917,6 +935,21 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/eb/f5/b9e2a42aa8f9e34d52d66de87941ecd236570c7ed2e87775ed23bbe4e224/pymdown_extensions-10.14.3-py3-none-any.whl", hash = "sha256:05e0bee73d64b9c71a4ae17c72abc2f700e8bc8403755a00580b49a4e9f189e9", size = 264467 },
 ]
 
+[[package]]
+name = "pytest"
+version = "8.3.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+    { name = "iniconfig" },
+    { name = "packaging" },
+    { name = "pluggy" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/05/35/30e0d83068951d90a01852cb1cef56e5d8a09d20c7f511634cc2f7e0372a/pytest-8.3.4.tar.gz", hash = "sha256:965370d062bce11e73868e0335abac31b4d3de0e82f4007408d242b4f8610761", size = 1445919 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/11/92/76a1c94d3afee238333bc0a42b82935dd8f9cf8ce9e336ff87ee14d9e1cf/pytest-8.3.4-py3-none-any.whl", hash = "sha256:50e16d954148559c9a74109af1eaf0c945ba2d8f30f0a3d3335edde19788b6f6", size = 343083 },
+]
+
 [[package]]
 name = "python-dateutil"
 version = "2.9.0.post0"
@@ -1505,6 +1538,7 @@ dependencies = [
     { name = "fastapi" },
     { name = "pandas" },
     { name = "pydantic" },
+    { name = "pytest" },
     { name = "requests" },
     { name = "streamlit" },
 ]
@@ -1526,6 +1560,7 @@ requires-dist = [
     { name = "fastapi", specifier = ">=0.115.8" },
     { name = "pandas", specifier = ">=2.2.3" },
     { name = "pydantic", specifier = ">=2.10.6" },
+    { name = "pytest", specifier = ">=8.3.4" },
     { name = "requests", specifier = ">=2.32.3" },
     { name = "streamlit", specifier = "==1.40.1" },
 ]
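
With pytest now declared in pyproject.toml and locked in uv.lock, the new consistency test can be run from the synced environment. One possible programmatic invocation (a sketch; it assumes the test file sits at the repository root):

# Sketch: run only the new lookahead-consistency test via pytest's public API.
import pytest

if __name__ == "__main__":
    raise SystemExit(pytest.main(["-q", "test_llm_inference.py::test_lookahead_token_consistency"]))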