{"docstore/metadata": {"6bd29218-fa1f-41b9-801c-4a309986263c": {"doc_hash": "c8b90a4763410f5d9299c7169bdb7e5d27285c010d14a97a55674200ebd4f7d9"}, "044a8f5d-3e9b-452b-81e0-dd7fb55823a8": {"doc_hash": "036f8365840d07cb6325fe6ce714e3e95163bbba2efb9e0271e9c01f17ab4314"}, "6731a091-04a0-42ab-b648-f71f1b3f4e83": {"doc_hash": "0214331b38b1eab1f4df5dc82384ef1307487aec4461ff518da05e862f646b0e"}, "478cc06b-9553-4eea-96a1-0134b150348a": {"doc_hash": "d8aeb72de6eff728d7b517da670afb88bfc36fb4ae7f92d7fa2c0931c734d44a"}, "da6c62ab-1a0f-441e-914a-187cc01a2daa": {"doc_hash": "7acdbe56b13bac20f672b675d82b51f491e07084962f1efd885a3cca0aa06aed"}, "90338cfe-1bf2-4827-932a-096909ab9872": {"doc_hash": "444d8895e95f618363affd6b8b4ceb78f3812578942ed3f024d3553c6c6389b2"}, "55b6a4bb-4ea6-4be6-abe5-13c3836c9b45": {"doc_hash": "e718d31a9269875e567c7e7cd5415223c1583b3a8c5afca275f047acecbb9305"}, "a30ff13f-7b36-4e5a-9136-78a519486858": {"doc_hash": "59ca232a1eafe6d02c1682212046131cc28120b0969fc1520f3d034d1157d803"}, "13cf01d1-2f55-4058-9894-15fdc7e35c53": {"doc_hash": "b6dcf65a26aa4dd46a4c35eb38773e8d8bddde9bc1208b3a9bc6cc062a753f29"}, "c8957502-16b6-4315-ae07-380d414096aa": {"doc_hash": "71dd463aa6918f631aced752f45b398365bf9fea6c940d6cd2bd68f3f5f27611"}, "040471c5-5c98-4579-9b44-8e3341a925a8": {"doc_hash": "065b4f45f79ac2ef8c64969651c5f057b8ca856739025e0afd58e1dc5d98ffd0"}, "6a3c28fb-12c5-40a9-b031-48d4cd59dfde": {"doc_hash": "f23883e0ad0a4c667d9be13737911d5c7483c6be5d8a121f9f876ac0f27c3d09"}, "47d16afe-8908-4bcf-a028-8d172369f372": {"doc_hash": "2c307026fd7638d6941c17055ee80f32fe3a6a91be2d489c74de1bc2ddcbcf59"}, "548c017c-00a3-4857-98bb-e9aa5c8182e4": {"doc_hash": "8e55790f5185aa0aa3775dc612146b7947b6e6525d8072dc583dfe7d6ce50340"}, "a0b037f8-b3de-44b1-bb1a-7c82f59469e5": {"doc_hash": "390f50475780109546a85dbd6ff1da83be120abbff8ce533acd3e83987a5690f"}, "f2aa086e-b546-4f44-8dba-69293c4cb9a5": {"doc_hash": "2c6a57699739f09affe34a1c18561bfba348306bf4792e7dca4163cdf7349e4d"}, "37191111-9247-427c-a43b-7f1a6ff23a3a": {"doc_hash": "b2795736168ab74c76b5f60425ef8d80718f4f57dee8dcac4f824451393ae79a"}, "4d507172-589c-4f01-9292-492b28b20cce": {"doc_hash": "a2510024218bbdd1221a5ffa5eb6f97e06297bee1cdcbe3abfe2df0c61bfd4f6"}, "7e1a96ce-5cb2-47f7-8971-90428d7e4e2a": {"doc_hash": "a251b7a4276eb22942398151ee4c105631f321688f387dc5c94fd9dda10e6dfc"}, "e3a3f0cc-7af7-4746-820c-6ce4a054fb49": {"doc_hash": "5e5fd1f3262954d70dececa0a8c62877a2ceb59b672656b04a3f68a9650938ef"}, "e705f9a7-c5f5-4c0b-941f-b1d9d5301b3e": {"doc_hash": "0620d390152142bad87f021961c874500f0269384b45ea10521b1d71a8cb5680", "ref_doc_id": "6bd29218-fa1f-41b9-801c-4a309986263c"}, "5060d247-f056-4a31-b81d-28a5dea38624": {"doc_hash": "0d566fd5ae551989d4f1bfb83ffbd6263329ad1ea1a4c3b771ef45b3e9fc87fa", "ref_doc_id": "044a8f5d-3e9b-452b-81e0-dd7fb55823a8"}, "12aec01a-74aa-474b-8c31-5ebfc27d3234": {"doc_hash": "e9a5f971f16c25f1c2a6debd935d82b7eafd172414b7654d502f00c51f46f0ec", "ref_doc_id": "6731a091-04a0-42ab-b648-f71f1b3f4e83"}, "e8afa57e-0f27-483c-8f64-c2e198cfe7c0": {"doc_hash": "0a0ad70325d4c9ab1e550ee3e14c8f7159bfc74dafac45d1215b19a922893a9e", "ref_doc_id": "478cc06b-9553-4eea-96a1-0134b150348a"}, "fdad2622-2190-4427-9067-c31534e49d59": {"doc_hash": "b28a7c1377be5823409d25c21760a44f26907e121fbe74b27fb9dcb76e9af3d1", "ref_doc_id": "da6c62ab-1a0f-441e-914a-187cc01a2daa"}, "71eaa072-dc72-4228-84a0-6935bfce3f98": {"doc_hash": "b43bdcbf3e00af41f68125f2544c965e87a39a81868703e12ac1b789e0e64beb", "ref_doc_id": "90338cfe-1bf2-4827-932a-096909ab9872"}, "e4fe9c90-10d4-4cd5-a12d-51d7905bbb33": {"doc_hash": "87929ef3f9851332fd60d4609e70735ab31a09110a50f5595845f8d7d61cabf3", "ref_doc_id": "55b6a4bb-4ea6-4be6-abe5-13c3836c9b45"}, "b646165e-9c51-4eff-a62b-4f37e46872b9": {"doc_hash": "233741f2ee8fe5cfc114a6ef22bb3128901a583a3833ea541bcb0a01498bfc00", "ref_doc_id": "a30ff13f-7b36-4e5a-9136-78a519486858"}, "c29e002e-43e1-44bf-b179-63a039627571": {"doc_hash": "36959e65f66bda665c29188211238edeb5236dd42283add02a4ff1f6e74574fb", "ref_doc_id": "13cf01d1-2f55-4058-9894-15fdc7e35c53"}, "414c6224-f841-47a4-8c80-558c49018337": {"doc_hash": "a259d3e5bfa4bb8101e9d8c4b65c93c06c725eb3fbc2e6ca6a463e9f87cdc161", "ref_doc_id": "c8957502-16b6-4315-ae07-380d414096aa"}, "1091bcbd-fd6d-4039-aadd-b241ceff6b7a": {"doc_hash": "3ed8dc8bc7c153acd0601728154bcb2a7999dababbc3f4fc4a4edd33ddc9de3c", "ref_doc_id": "040471c5-5c98-4579-9b44-8e3341a925a8"}, "71297e5e-93ef-4382-a45b-5f43c5f36ade": {"doc_hash": "615ef6f6a320de4e8ca2f82cf5444f6539d582b40b2bebca375b7469c2002d26", "ref_doc_id": "6a3c28fb-12c5-40a9-b031-48d4cd59dfde"}, "72f77e9a-1096-48b5-a14b-01422cff5880": {"doc_hash": "52d9343a8597669773b36dd37bd0efe3df09e1f3639ebb0f7ca40c51f85d295d", "ref_doc_id": "47d16afe-8908-4bcf-a028-8d172369f372"}, "b7d9a32c-188d-497c-805d-e0e7b2b39b3e": {"doc_hash": "3b1fc9c245080eefeb510b07e559ba3a1536c03cfe1c00b581734d15b2dd806f", "ref_doc_id": "548c017c-00a3-4857-98bb-e9aa5c8182e4"}, "02f51336-3954-4b74-b068-899e2322d368": {"doc_hash": "f05267f8ef16e2766a84842151f48232681c31b93bba04a5d87625be23959dd9", "ref_doc_id": "a0b037f8-b3de-44b1-bb1a-7c82f59469e5"}, "364bc31a-a379-4c49-a666-a56ceb46705b": {"doc_hash": "da8434e2b637a445c2a76c32938818fe45676a6dd523ad18f78e7b4af80637af", "ref_doc_id": "f2aa086e-b546-4f44-8dba-69293c4cb9a5"}, "73ed22dc-c89b-4d52-a35b-3b1a59ae4eb2": {"doc_hash": "27d783774b40875536feb0213d013446070ae7269732ea3adbf3c6472dc6a06a", "ref_doc_id": "37191111-9247-427c-a43b-7f1a6ff23a3a"}, "b2dd87cb-2c16-4133-88fd-11dd778b54d8": {"doc_hash": "212b6bf16abbcff2538ec482f136f97d47667953c9b42d504ceac1dd70790b67", "ref_doc_id": "4d507172-589c-4f01-9292-492b28b20cce"}, "894fe9ba-ff1c-4dac-ad2e-e7c3a990814c": {"doc_hash": "04380f6281eb7062bf843677dfa69d8974b6e965463b90c268f1d4002555b668", "ref_doc_id": "7e1a96ce-5cb2-47f7-8971-90428d7e4e2a"}, "aba7eaaf-67eb-4168-825e-9c2cc802e04d": {"doc_hash": "07ff582d3d01f2af70000eb72d7dc425599c60031f9bff1e5a6f0ea0aa4cefe4", "ref_doc_id": "e3a3f0cc-7af7-4746-820c-6ce4a054fb49"}}, "docstore/data": {"e705f9a7-c5f5-4c0b-941f-b1d9d5301b3e": {"__data__": {"id_": "e705f9a7-c5f5-4c0b-941f-b1d9d5301b3e", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "6bd29218-fa1f-41b9-801c-4a309986263c", "node_type": "4", "metadata": {}, "hash": "c8b90a4763410f5d9299c7169bdb7e5d27285c010d14a97a55674200ebd4f7d9", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "Title: Tree-of-Code: A Tree-Structured Exploring Framework for End-to-End Code Generation and Execution in Complex Task Handling\nAuthors: Ziyi Ni, Yifan Li, Ning Yang, Dou Shen, Pin Lv, Daxiang Dong\nSummary: Solving complex reasoning tasks is a key real-world application of agents.\nThanks to the pretraining of Large Language Models (LLMs) on code data, recent\napproaches like CodeAct successfully use code as LLM agents' action, achieving\ngood results. However, CodeAct greedily generates the next action's code block\nby relying on fragmented thoughts, resulting in inconsistency and instability.\nMoreover, CodeAct lacks action-related ground-truth (GT), making its\nsupervision signals and termination conditions questionable in multi-turn\ninteractions. To address these issues, we first introduce a simple yet\neffective end-to-end code generation paradigm, CodeProgram, which leverages\ncode's systematic logic to align with global reasoning and enable cohesive\nproblem-solving. Then, we propose Tree-of-Code (ToC), which self-grows\nCodeProgram nodes based on the executable nature of the code and enables\nself-supervision in a GT-free scenario. Experimental results on two datasets\nusing ten popular zero-shot LLMs show ToC remarkably boosts accuracy by nearly\n20% over CodeAct with less than 1/4 turns. Several LLMs even perform better on\none-turn CodeProgram than on multi-turn CodeAct. To further investigate the\ntrade-off between efficacy and efficiency, we test different ToC tree sizes and\nexploration mechanisms. We also highlight the potential of ToC's end-to-end\ndata generation for supervised and reinforced fine-tuning.\nPublished: 2024-12-19 12:31:22+00:00\nJournal Reference: None\nDOI: None\nPrimary Category: cs.SE\nCategories: cs.SE, cs.AI\nPDF URL: http://arxiv.org/pdf/2412.15305v1\narXiv URL: http://arxiv.org/abs/2412.15305v1", "mimetype": "text/plain", "start_char_idx": 0, "end_char_idx": 1841, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "5060d247-f056-4a31-b81d-28a5dea38624": {"__data__": {"id_": "5060d247-f056-4a31-b81d-28a5dea38624", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "044a8f5d-3e9b-452b-81e0-dd7fb55823a8", "node_type": "4", "metadata": {}, "hash": "036f8365840d07cb6325fe6ce714e3e95163bbba2efb9e0271e9c01f17ab4314", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "Title: On Verbalized Confidence Scores for LLMs\nAuthors: Daniel Yang, Yao-Hung Hubert Tsai, Makoto Yamada\nSummary: The rise of large language models (LLMs) and their tight integration into our\ndaily life make it essential to dedicate efforts towards their trustworthiness.\nUncertainty quantification for LLMs can establish more human trust into their\nresponses, but also allows LLM agents to make more informed decisions based on\neach other's uncertainty. To estimate the uncertainty in a response, internal\ntoken logits, task-specific proxy models, or sampling of multiple responses are\ncommonly used. This work focuses on asking the LLM itself to verbalize its\nuncertainty with a confidence score as part of its output tokens, which is a\npromising way for prompt- and model-agnostic uncertainty quantification with\nlow overhead. Using an extensive benchmark, we assess the reliability of\nverbalized confidence scores with respect to different datasets, models, and\nprompt methods. Our results reveal that the reliability of these scores\nstrongly depends on how the model is asked, but also that it is possible to\nextract well-calibrated confidence scores with certain prompt methods. We argue\nthat verbalized confidence scores can become a simple but effective and\nversatile uncertainty quantification method in the future. Our code is\navailable at https://github.com/danielyxyang/llm-verbalized-uq .\nPublished: 2024-12-19 11:10:36+00:00\nJournal Reference: None\nDOI: None\nPrimary Category: cs.CL\nCategories: cs.CL\nPDF URL: http://arxiv.org/pdf/2412.14737v1\narXiv URL: http://arxiv.org/abs/2412.14737v1", "mimetype": "text/plain", "start_char_idx": 0, "end_char_idx": 1603, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "12aec01a-74aa-474b-8c31-5ebfc27d3234": {"__data__": {"id_": "12aec01a-74aa-474b-8c31-5ebfc27d3234", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "6731a091-04a0-42ab-b648-f71f1b3f4e83", "node_type": "4", "metadata": {}, "hash": "0214331b38b1eab1f4df5dc82384ef1307487aec4461ff518da05e862f646b0e", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "Title: Agent-SafetyBench: Evaluating the Safety of LLM Agents\nAuthors: Zhexin Zhang, Shiyao Cui, Yida Lu, Jingzhuo Zhou, Junxiao Yang, Hongning Wang, Minlie Huang\nSummary: As large language models (LLMs) are increasingly deployed as agents, their\nintegration into interactive environments and tool use introduce new safety\nchallenges beyond those associated with the models themselves. However, the\nabsence of comprehensive benchmarks for evaluating agent safety presents a\nsignificant barrier to effective assessment and further improvement. In this\npaper, we introduce Agent-SafetyBench, a comprehensive benchmark designed to\nevaluate the safety of LLM agents. Agent-SafetyBench encompasses 349\ninteraction environments and 2,000 test cases, evaluating 8 categories of\nsafety risks and covering 10 common failure modes frequently encountered in\nunsafe interactions. Our evaluation of 16 popular LLM agents reveals a\nconcerning result: none of the agents achieves a safety score above 60%. This\nhighlights significant safety challenges in LLM agents and underscores the\nconsiderable need for improvement. Through quantitative analysis, we identify\ncritical failure modes and summarize two fundamental safety detects in current\nLLM agents: lack of robustness and lack of risk awareness. Furthermore, our\nfindings suggest that reliance on defense prompts alone is insufficient to\naddress these safety issues, emphasizing the need for more advanced and robust\nstrategies. We release Agent-SafetyBench at\n\\url{https://github.com/thu-coai/Agent-SafetyBench} to facilitate further\nresearch and innovation in agent safety evaluation and improvement.\nPublished: 2024-12-19 02:35:15+00:00\nJournal Reference: None\nDOI: None\nPrimary Category: cs.CL\nCategories: cs.CL\nPDF URL: http://arxiv.org/pdf/2412.14470v1\narXiv URL: http://arxiv.org/abs/2412.14470v1", "mimetype": "text/plain", "start_char_idx": 0, "end_char_idx": 1844, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "e8afa57e-0f27-483c-8f64-c2e198cfe7c0": {"__data__": {"id_": "e8afa57e-0f27-483c-8f64-c2e198cfe7c0", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "478cc06b-9553-4eea-96a1-0134b150348a", "node_type": "4", "metadata": {}, "hash": "d8aeb72de6eff728d7b517da670afb88bfc36fb4ae7f92d7fa2c0931c734d44a", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "Title: TheAgentCompany: Benchmarking LLM Agents on Consequential Real World Tasks\nAuthors: Frank F. Xu, Yufan Song, Boxuan Li, Yuxuan Tang, Kritanjali Jain, Mengxue Bao, Zora Z. Wang, Xuhui Zhou, Zhitong Guo, Murong Cao, Mingyang Yang, Hao Yang Lu, Amaad Martin, Zhe Su, Leander Maben, Raj Mehta, Wayne Chi, Lawrence Jang, Yiqing Xie, Shuyan Zhou, Graham Neubig\nSummary: We interact with computers on an everyday basis, be it in everyday life or\nwork, and many aspects of work can be done entirely with access to a computer\nand the Internet. At the same time, thanks to improvements in large language\nmodels (LLMs), there has also been a rapid development in AI agents that\ninteract with and affect change in their surrounding environments. But how\nperformant are AI agents at helping to accelerate or even autonomously perform\nwork-related tasks? The answer to this question has important implications for\nboth industry looking to adopt AI into their workflows, and for economic policy\nto understand the effects that adoption of AI may have on the labor market. To\nmeasure the progress of these LLM agents' performance on performing real-world\nprofessional tasks, in this paper, we introduce TheAgentCompany, an extensible\nbenchmark for evaluating AI agents that interact with the world in similar ways\nto those of a digital worker: by browsing the Web, writing code, running\nprograms, and communicating with other coworkers. We build a self-contained\nenvironment with internal web sites and data that mimics a small software\ncompany environment, and create a variety of tasks that may be performed by\nworkers in such a company. We test baseline agents powered by both closed\nAPI-based and open-weights language models (LMs), and find that with the most\ncompetitive agent, 24% of the tasks can be completed autonomously. This paints\na nuanced picture on task automation with LM agents -- in a setting simulating\na real workplace, a good portion of simpler tasks could be solved autonomously,\nbut more difficult long-horizon tasks are still beyond the reach of current\nsystems.\nPublished: 2024-12-18 18:55:40+00:00\nJournal Reference: None\nDOI: None\nPrimary Category: cs.CL\nCategories: cs.CL\nPDF URL: http://arxiv.org/pdf/2412.14161v1\narXiv URL: http://arxiv.org/abs/2412.14161v1", "mimetype": "text/plain", "start_char_idx": 0, "end_char_idx": 2278, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "fdad2622-2190-4427-9067-c31534e49d59": {"__data__": {"id_": "fdad2622-2190-4427-9067-c31534e49d59", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "da6c62ab-1a0f-441e-914a-187cc01a2daa", "node_type": "4", "metadata": {}, "hash": "7acdbe56b13bac20f672b675d82b51f491e07084962f1efd885a3cca0aa06aed", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "Title: Exploring Multi-Modal Integration with Tool-Augmented LLM Agents for Precise Causal Discovery\nAuthors: ChengAo Shen, Zhengzhang Chen, Dongsheng Luo, Dongkuan Xu, Haifeng Chen, Jingchao Ni\nSummary: Causal inference is an imperative foundation for decision-making across\ndomains, such as smart health, AI for drug discovery and AIOps. Traditional\nstatistical causal discovery methods, while well-established, predominantly\nrely on observational data and often overlook the semantic cues inherent in\ncause-and-effect relationships. The advent of Large Language Models (LLMs) has\nushered in an affordable way of leveraging the semantic cues for\nknowledge-driven causal discovery, but the development of LLMs for causal\ndiscovery lags behind other areas, particularly in the exploration of\nmulti-modality data. To bridge the gap, we introduce MATMCD, a multi-agent\nsystem powered by tool-augmented LLMs. MATMCD has two key agents: a Data\nAugmentation agent that retrieves and processes modality-augmented data, and a\nCausal Constraint agent that integrates multi-modal data for knowledge-driven\ninference. Delicate design of the inner-workings ensures successful cooperation\nof the agents. Our empirical study across seven datasets suggests the\nsignificant potential of multi-modality enhanced causal discovery.\nPublished: 2024-12-18 09:50:00+00:00\nJournal Reference: None\nDOI: None\nPrimary Category: cs.LG\nCategories: cs.LG, cs.AI, stat.ME\nPDF URL: http://arxiv.org/pdf/2412.13667v1\narXiv URL: http://arxiv.org/abs/2412.13667v1", "mimetype": "text/plain", "start_char_idx": 0, "end_char_idx": 1530, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "71eaa072-dc72-4228-84a0-6935bfce3f98": {"__data__": {"id_": "71eaa072-dc72-4228-84a0-6935bfce3f98", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "90338cfe-1bf2-4827-932a-096909ab9872", "node_type": "4", "metadata": {}, "hash": "444d8895e95f618363affd6b8b4ceb78f3812578942ed3f024d3553c6c6389b2", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "Title: Tree-of-Code: A Hybrid Approach for Robust Complex Task Planning and Execution\nAuthors: Ziyi Ni, Yifan Li, Daxiang Dong\nSummary: The exceptional capabilities of large language models (LLMs) have\nsubstantially accelerated the rapid rise and widespread adoption of agents.\nRecent studies have demonstrated that generating Python code to consolidate\nLLM-based agents' actions into a unified action space (CodeAct) is a promising\napproach for developing real-world LLM agents. However, this step-by-step code\ngeneration approach often lacks consistency and robustness, leading to\ninstability in agent applications, particularly for complex reasoning and\nout-of-domain tasks. In this paper, we propose a novel approach called\nTree-of-Code (ToC) to tackle the challenges of complex problem planning and\nexecution with an end-to-end mechanism. By integrating key ideas from both\nTree-of-Thought and CodeAct, ToC combines their strengths to enhance solution\nexploration. In our framework, each final code execution result is treated as a\nnode in the decision tree, with a breadth-first search strategy employed to\nexplore potential solutions. The final outcome is determined through a voting\nmechanism based on the outputs of the nodes.\nPublished: 2024-12-18 08:47:17+00:00\nJournal Reference: None\nDOI: None\nPrimary Category: cs.SE\nCategories: cs.SE, cs.AI\nPDF URL: http://arxiv.org/pdf/2412.14212v1\narXiv URL: http://arxiv.org/abs/2412.14212v1", "mimetype": "text/plain", "start_char_idx": 0, "end_char_idx": 1443, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "e4fe9c90-10d4-4cd5-a12d-51d7905bbb33": {"__data__": {"id_": "e4fe9c90-10d4-4cd5-a12d-51d7905bbb33", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "55b6a4bb-4ea6-4be6-abe5-13c3836c9b45", "node_type": "4", "metadata": {}, "hash": "e718d31a9269875e567c7e7cd5415223c1583b3a8c5afca275f047acecbb9305", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "Title: SafeAgentBench: A Benchmark for Safe Task Planning of Embodied LLM Agents\nAuthors: Sheng Yin, Xianghe Pang, Yuanzhuo Ding, Menglan Chen, Yutong Bi, Yichen Xiong, Wenhao Huang, Zhen Xiang, Jing Shao, Siheng Chen\nSummary: With the integration of large language models (LLMs), embodied agents have\nstrong capabilities to execute complicated instructions in natural language,\npaving a way for the potential deployment of embodied robots. However, a\nforeseeable issue is that those embodied agents can also flawlessly execute\nsome hazardous tasks, potentially causing damages in real world. To study this\nissue, we present SafeAgentBench -- a new benchmark for safety-aware task\nplanning of embodied LLM agents. SafeAgentBench includes: (1) a new dataset\nwith 750 tasks, covering 10 potential hazards and 3 task types; (2)\nSafeAgentEnv, a universal embodied environment with a low-level controller,\nsupporting multi-agent execution with 17 high-level actions for 8\nstate-of-the-art baselines; and (3) reliable evaluation methods from both\nexecution and semantic perspectives. Experimental results show that the\nbest-performing baseline gets 69% success rate for safe tasks, but only 5%\nrejection rate for hazardous tasks, indicating significant safety risks. More\ndetails and codes are available at\nhttps://github.com/shengyin1224/SafeAgentBench.\nPublished: 2024-12-17 18:55:58+00:00\nJournal Reference: None\nDOI: None\nPrimary Category: cs.CR\nCategories: cs.CR, cs.AI, cs.RO\nPDF URL: http://arxiv.org/pdf/2412.13178v2\narXiv URL: http://arxiv.org/abs/2412.13178v2", "mimetype": "text/plain", "start_char_idx": 0, "end_char_idx": 1563, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "b646165e-9c51-4eff-a62b-4f37e46872b9": {"__data__": {"id_": "b646165e-9c51-4eff-a62b-4f37e46872b9", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "a30ff13f-7b36-4e5a-9136-78a519486858", "node_type": "4", "metadata": {}, "hash": "59ca232a1eafe6d02c1682212046131cc28120b0969fc1520f3d034d1157d803", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "Title: Memory-Augmented Agent Training for Business Document Understanding\nAuthors: Jiale Liu, Yifan Zeng, Malte H\u00f8jmark-Bertelsen, Marie Normann Gadeberg, Huazheng Wang, Qingyun Wu\nSummary: Traditional enterprises face significant challenges in processing business\ndocuments, where tasks like extracting transport references from invoices\nremain largely manual despite their crucial role in logistics operations. While\nLarge Language Models offer potential automation, their direct application to\nspecialized business domains often yields unsatisfactory results. We introduce\nMatrix (Memory-Augmented agent Training through Reasoning and Iterative\neXploration), a novel paradigm that enables LLM agents to progressively build\ndomain expertise through experience-driven memory refinement and iterative\nlearning. To validate this approach, we collaborate with one of the world's\nlargest logistics companies to create a dataset of Universal Business Language\nformat invoice documents, focusing on the task of transport reference\nextraction. Experiments demonstrate that Matrix outperforms prompting a single\nLLM by 30.3%, vanilla LLM agent by 35.2%. We further analyze the metrics of the\noptimized systems and observe that the agent system requires less API calls,\nfewer costs and can analyze longer documents on average. Our methods establish\na new approach to transform general-purpose LLMs into specialized business\ntools through systematic memory enhancement in document processing tasks.\nPublished: 2024-12-17 18:35:04+00:00\nJournal Reference: None\nDOI: None\nPrimary Category: cs.CL\nCategories: cs.CL, cs.AI\nPDF URL: http://arxiv.org/pdf/2412.15274v1\narXiv URL: http://arxiv.org/abs/2412.15274v1", "mimetype": "text/plain", "start_char_idx": 0, "end_char_idx": 1698, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "c29e002e-43e1-44bf-b179-63a039627571": {"__data__": {"id_": "c29e002e-43e1-44bf-b179-63a039627571", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "13cf01d1-2f55-4058-9894-15fdc7e35c53", "node_type": "4", "metadata": {}, "hash": "b6dcf65a26aa4dd46a4c35eb38773e8d8bddde9bc1208b3a9bc6cc062a753f29", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "Title: On the Structural Memory of LLM Agents\nAuthors: Ruihong Zeng, Jinyuan Fang, Siwei Liu, Zaiqiao Meng\nSummary: Memory plays a pivotal role in enabling large language model~(LLM)-based\nagents to engage in complex and long-term interactions, such as question\nanswering (QA) and dialogue systems. While various memory modules have been\nproposed for these tasks, the impact of different memory structures across\ntasks remains insufficiently explored. This paper investigates how memory\nstructures and memory retrieval methods affect the performance of LLM-based\nagents. Specifically, we evaluate four types of memory structures, including\nchunks, knowledge triples, atomic facts, and summaries, along with mixed memory\nthat combines these components. In addition, we evaluate three widely used\nmemory retrieval methods: single-step retrieval, reranking, and iterative\nretrieval. Extensive experiments conducted across four tasks and six datasets\nyield the following key insights: (1) Different memory structures offer\ndistinct advantages, enabling them to be tailored to specific tasks; (2) Mixed\nmemory structures demonstrate remarkable resilience in noisy environments; (3)\nIterative retrieval consistently outperforms other methods across various\nscenarios. Our investigation aims to inspire further research into the design\nof memory systems for LLM-based agents.\nPublished: 2024-12-17 04:30:00+00:00\nJournal Reference: None\nDOI: None\nPrimary Category: cs.CL\nCategories: cs.CL, cs.AI\nPDF URL: http://arxiv.org/pdf/2412.15266v1\narXiv URL: http://arxiv.org/abs/2412.15266v1", "mimetype": "text/plain", "start_char_idx": 0, "end_char_idx": 1576, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "414c6224-f841-47a4-8c80-558c49018337": {"__data__": {"id_": "414c6224-f841-47a4-8c80-558c49018337", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "c8957502-16b6-4315-ae07-380d414096aa", "node_type": "4", "metadata": {}, "hash": "71dd463aa6918f631aced752f45b398365bf9fea6c940d6cd2bd68f3f5f27611", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "Title: Codenames as a Benchmark for Large Language Models\nAuthors: Matthew Stephenson, Matthew Sidji, Beno\u00eet Ronval\nSummary: In this paper, we propose the use of the popular word-based board game\nCodenames as a suitable benchmark for evaluating the reasoning capabilities of\nLarge Language Models (LLMs). Codenames presents a highly interesting challenge\nfor achieving successful AI performance, requiring both a sophisticated\nunderstanding of language, theory of mind, and epistemic reasoning\ncapabilities. Prior attempts to develop agents for Codenames have largely\nrelied on word embedding techniques, which have a limited vocabulary range and\nperform poorly when paired with differing approaches. LLMs have demonstrated\nenhanced reasoning and comprehension capabilities for language-based tasks, but\ncan still suffer in lateral thinking challenges. We evaluate the capabilities\nof several state-of-the-art LLMs, including GPT-4o, Gemini 1.5, Claude 3.5\nSonnet, and Llama 3.1, across a variety of board setups. Our results indicate\nthat while certain LLMs perform better than others overall, different models\nexhibit varying emergent behaviours during gameplay and excel at specific\nroles. We also evaluate the performance of different combinations of LLMs when\nplaying cooperatively together, demonstrating that LLM agents are more\ngeneralisable to a wider range of teammates than prior techniques.\nPublished: 2024-12-16 01:59:03+00:00\nJournal Reference: None\nDOI: None\nPrimary Category: cs.AI\nCategories: cs.AI, cs.CL\nPDF URL: http://arxiv.org/pdf/2412.11373v1\narXiv URL: http://arxiv.org/abs/2412.11373v1", "mimetype": "text/plain", "start_char_idx": 0, "end_char_idx": 1610, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "1091bcbd-fd6d-4039-aadd-b241ceff6b7a": {"__data__": {"id_": "1091bcbd-fd6d-4039-aadd-b241ceff6b7a", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "040471c5-5c98-4579-9b44-8e3341a925a8", "node_type": "4", "metadata": {}, "hash": "065b4f45f79ac2ef8c64969651c5f057b8ca856739025e0afd58e1dc5d98ffd0", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "Title: Cultural Evolution of Cooperation among LLM Agents\nAuthors: Aron Vallinder, Edward Hughes\nSummary: Large language models (LLMs) provide a compelling foundation for building\ngenerally-capable AI agents. These agents may soon be deployed at scale in the\nreal world, representing the interests of individual humans (e.g., AI\nassistants) or groups of humans (e.g., AI-accelerated corporations). At\npresent, relatively little is known about the dynamics of multiple LLM agents\ninteracting over many generations of iterative deployment. In this paper, we\nexamine whether a \"society\" of LLM agents can learn mutually beneficial social\nnorms in the face of incentives to defect, a distinctive feature of human\nsociality that is arguably crucial to the success of civilization. In\nparticular, we study the evolution of indirect reciprocity across generations\nof LLM agents playing a classic iterated Donor Game in which agents can observe\nthe recent behavior of their peers. We find that the evolution of cooperation\ndiffers markedly across base models, with societies of Claude 3.5 Sonnet agents\nachieving significantly higher average scores than Gemini 1.5 Flash, which, in\nturn, outperforms GPT-4o. Further, Claude 3.5 Sonnet can make use of an\nadditional mechanism for costly punishment to achieve yet higher scores, while\nGemini 1.5 Flash and GPT-4o fail to do so. For each model class, we also\nobserve variation in emergent behavior across random seeds, suggesting an\nunderstudied sensitive dependence on initial conditions. We suggest that our\nevaluation regime could inspire an inexpensive and informative new class of LLM\nbenchmarks, focussed on the implications of LLM agent deployment for the\ncooperative infrastructure of society.\nPublished: 2024-12-13 16:45:49+00:00\nJournal Reference: None\nDOI: None\nPrimary Category: cs.MA\nCategories: cs.MA, cs.AI\nPDF URL: http://arxiv.org/pdf/2412.10270v1\narXiv URL: http://arxiv.org/abs/2412.10270v1", "mimetype": "text/plain", "start_char_idx": 0, "end_char_idx": 1948, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "71297e5e-93ef-4382-a45b-5f43c5f36ade": {"__data__": {"id_": "71297e5e-93ef-4382-a45b-5f43c5f36ade", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "6a3c28fb-12c5-40a9-b031-48d4cd59dfde", "node_type": "4", "metadata": {}, "hash": "f23883e0ad0a4c667d9be13737911d5c7483c6be5d8a121f9f876ac0f27c3d09", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "Title: ROUTE: Robust Multitask Tuning and Collaboration for Text-to-SQL\nAuthors: Yang Qin, Chao Chen, Zhihang Fu, Ze Chen, Dezhong Peng, Peng Hu, Jieping Ye\nSummary: Despite the significant advancements in Text-to-SQL (Text2SQL) facilitated by\nlarge language models (LLMs), the latest state-of-the-art techniques are still\ntrapped in the in-context learning of closed-source LLMs (e.g., GPT-4), which\nlimits their applicability in open scenarios. To address this challenge, we\npropose a novel RObust mUltitask Tuning and collaboration mEthod (ROUTE) to\nimprove the comprehensive capabilities of open-source LLMs for Text2SQL,\nthereby providing a more practical solution. Our approach begins with\nmulti-task supervised fine-tuning (SFT) using various synthetic training data\nrelated to SQL generation. Unlike existing SFT-based Text2SQL methods, we\nintroduced several additional SFT tasks, including schema linking, noise\ncorrection, and continuation writing. Engaging in a variety of SQL generation\ntasks enhances the model's understanding of SQL syntax and improves its ability\nto generate high-quality SQL queries. Additionally, inspired by the\ncollaborative modes of LLM agents, we introduce a Multitask Collaboration\nPrompting (MCP) strategy. This strategy leverages collaboration across several\nSQL-related tasks to reduce hallucinations during SQL generation, thereby\nmaximizing the potential of enhancing Text2SQL performance through explicit\nmultitask capabilities. Extensive experiments and in-depth analyses have been\nperformed on eight open-source LLMs and five widely-used benchmarks. The\nresults demonstrate that our proposal outperforms the latest Text2SQL methods\nand yields leading performance.\nPublished: 2024-12-13 13:41:18+00:00\nJournal Reference: None\nDOI: None\nPrimary Category: cs.CL\nCategories: cs.CL, cs.AI\nPDF URL: http://arxiv.org/pdf/2412.10138v1\narXiv URL: http://arxiv.org/abs/2412.10138v1", "mimetype": "text/plain", "start_char_idx": 0, "end_char_idx": 1918, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "72f77e9a-1096-48b5-a14b-01422cff5880": {"__data__": {"id_": "72f77e9a-1096-48b5-a14b-01422cff5880", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "47d16afe-8908-4bcf-a028-8d172369f372", "node_type": "4", "metadata": {}, "hash": "2c307026fd7638d6941c17055ee80f32fe3a6a91be2d489c74de1bc2ddcbcf59", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "Title: You Name It, I Run It: An LLM Agent to Execute Tests of Arbitrary Projects\nAuthors: Islem Bouzenia, Michael Pradel\nSummary: The ability to execute the test suite of a project is essential in many\nscenarios, e.g., to assess code quality and code coverage, to validate code\nchanges made by developers or automated tools, and to ensure compatibility with\ndependencies. Despite its importance, executing the test suite of a project can\nbe challenging in practice because different projects use different programming\nlanguages, software ecosystems, build systems, testing frameworks, and other\ntools. These challenges make it difficult to create a reliable, universal test\nexecution method that works across different projects. This paper presents\nExecutionAgent, an automated technique that installs arbitrary projects,\nconfigures them to run test cases, and produces project-specific scripts to\nreproduce the setup. Inspired by the way a human developer would address this\ntask, our approach is a large language model-based agent that autonomously\nexecutes commands and interacts with the host system. The agent uses\nmeta-prompting to gather guidelines on the latest technologies related to the\ngiven project, and it iteratively refines its process based on feedback from\nthe previous steps. Our evaluation applies ExecutionAgent to 50 open-source\nprojects that use 14 different programming languages and many different build\nand testing tools. The approach successfully executes the test suites of 33/55\nprojects, while matching the test results of ground truth test suite executions\nwith a deviation of only 7.5\\%. These results improve over the best previously\navailable technique by 6.6x. The costs imposed by the approach are reasonable,\nwith an execution time of 74 minutes and LLM costs of 0.16 dollars, on average\nper project. We envision ExecutionAgent to serve as a valuable tool for\ndevelopers, automated programming tools, and researchers that need to execute\ntests across a wide variety of projects.\nPublished: 2024-12-13 13:30:51+00:00\nJournal Reference: None\nDOI: None\nPrimary Category: cs.SE\nCategories: cs.SE, cs.AI\nPDF URL: http://arxiv.org/pdf/2412.10133v1\narXiv URL: http://arxiv.org/abs/2412.10133v1", "mimetype": "text/plain", "start_char_idx": 0, "end_char_idx": 2224, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "b7d9a32c-188d-497c-805d-e0e7b2b39b3e": {"__data__": {"id_": "b7d9a32c-188d-497c-805d-e0e7b2b39b3e", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "548c017c-00a3-4857-98bb-e9aa5c8182e4", "node_type": "4", "metadata": {}, "hash": "8e55790f5185aa0aa3775dc612146b7947b6e6525d8072dc583dfe7d6ce50340", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "Title: ChatDyn: Language-Driven Multi-Actor Dynamics Generation in Street Scenes\nAuthors: Yuxi Wei, Jingbo Wang, Yuwen Du, Dingju Wang, Liang Pan, Chenxin Xu, Yao Feng, Bo Dai, Siheng Chen\nSummary: Generating realistic and interactive dynamics of traffic participants\naccording to specific instruction is critical for street scene simulation.\nHowever, there is currently a lack of a comprehensive method that generates\nrealistic dynamics of different types of participants including vehicles and\npedestrians, with different kinds of interactions between them. In this paper,\nwe introduce ChatDyn, the first system capable of generating interactive,\ncontrollable and realistic participant dynamics in street scenes based on\nlanguage instructions. To achieve precise control through complex language,\nChatDyn employs a multi-LLM-agent role-playing approach, which utilizes natural\nlanguage inputs to plan the trajectories and behaviors for different traffic\nparticipants. To generate realistic fine-grained dynamics based on the\nplanning, ChatDyn designs two novel executors: the PedExecutor, a unified\nmulti-task executor that generates realistic pedestrian dynamics under\ndifferent task plannings; and the VehExecutor, a physical transition-based\npolicy that generates physically plausible vehicle dynamics. Extensive\nexperiments show that ChatDyn can generate realistic driving scene dynamics\nwith multiple vehicles and pedestrians, and significantly outperforms previous\nmethods on subtasks. Code and model will be available at\nhttps://vfishc.github.io/chatdyn.\nPublished: 2024-12-11 18:58:48+00:00\nJournal Reference: None\nDOI: None\nPrimary Category: cs.CV\nCategories: cs.CV\nPDF URL: http://arxiv.org/pdf/2412.08685v1\narXiv URL: http://arxiv.org/abs/2412.08685v1", "mimetype": "text/plain", "start_char_idx": 0, "end_char_idx": 1764, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "02f51336-3954-4b74-b068-899e2322d368": {"__data__": {"id_": "02f51336-3954-4b74-b068-899e2322d368", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "a0b037f8-b3de-44b1-bb1a-7c82f59469e5", "node_type": "4", "metadata": {}, "hash": "390f50475780109546a85dbd6ff1da83be120abbff8ce533acd3e83987a5690f", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "Title: TapeAgents: a Holistic Framework for Agent Development and Optimization\nAuthors: Dzmitry Bahdanau, Nicolas Gontier, Gabriel Huang, Ehsan Kamalloo, Rafael Pardinas, Alex Pich\u00e9, Torsten Scholak, Oleh Shliazhko, Jordan Prince Tremblay, Karam Ghanem, Soham Parikh, Mitul Tiwari, Quaizar Vohra\nSummary: We present TapeAgents, an agent framework built around a granular, structured\nlog tape of the agent session that also plays the role of the session's\nresumable state. In TapeAgents we leverage tapes to facilitate all stages of\nthe LLM Agent development lifecycle. The agent reasons by processing the tape\nand the LLM output to produce new thought and action steps and append them to\nthe tape. The environment then reacts to the agent's actions by likewise\nappending observation steps to the tape. By virtue of this tape-centred design,\nTapeAgents can provide AI practitioners with holistic end-to-end support. At\nthe development stage, tapes facilitate session persistence, agent auditing,\nand step-by-step debugging. Post-deployment, one can reuse tapes for\nevaluation, fine-tuning, and prompt-tuning; crucially, one can adapt tapes from\nother agents or use revised historical tapes. In this report, we explain the\nTapeAgents design in detail. We demonstrate possible applications of TapeAgents\nwith several concrete examples of building monolithic agents and multi-agent\nteams, of optimizing agent prompts and finetuning the agent's LLM. We present\ntooling prototypes and report a case study where we use TapeAgents to finetune\na Llama-3.1-8B form-filling assistant to perform as well as GPT-4o while being\norders of magnitude cheaper. Lastly, our comparative analysis shows that\nTapeAgents's advantages over prior frameworks stem from our novel design of the\nLLM agent as a resumable, modular state machine with a structured\nconfiguration, that generates granular, structured logs and that can transform\nthese logs into training text -- a unique combination of features absent in\nprevious work.\nPublished: 2024-12-11 15:09:54+00:00\nJournal Reference: None\nDOI: None\nPrimary Category: cs.AI\nCategories: cs.AI\nPDF URL: http://arxiv.org/pdf/2412.08445v1\narXiv URL: http://arxiv.org/abs/2412.08445v1", "mimetype": "text/plain", "start_char_idx": 0, "end_char_idx": 2203, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "364bc31a-a379-4c49-a666-a56ceb46705b": {"__data__": {"id_": "364bc31a-a379-4c49-a666-a56ceb46705b", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "f2aa086e-b546-4f44-8dba-69293c4cb9a5", "node_type": "4", "metadata": {}, "hash": "2c6a57699739f09affe34a1c18561bfba348306bf4792e7dca4163cdf7349e4d", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "Title: Federated In-Context LLM Agent Learning\nAuthors: Panlong Wu, Kangshuo Li, Junbao Nan, Fangxin Wang\nSummary: Large Language Models (LLMs) have revolutionized intelligent services by\nenabling logical reasoning, tool use, and interaction with external systems as\nagents. The advancement of LLMs is frequently hindered by the scarcity of\nhigh-quality data, much of which is inherently sensitive. Federated learning\n(FL) offers a potential solution by facilitating the collaborative training of\ndistributed LLMs while safeguarding private data. However, FL frameworks face\nsignificant bandwidth and computational demands, along with challenges from\nheterogeneous data distributions. The emerging in-context learning capability\nof LLMs offers a promising approach by aggregating natural language rather than\nbulky model parameters. Yet, this method risks privacy leakage, as it\nnecessitates the collection and presentation of data samples from various\nclients during aggregation. In this paper, we propose a novel\nprivacy-preserving Federated In-Context LLM Agent Learning (FICAL) algorithm,\nwhich to our best knowledge for the first work unleashes the power of\nin-context learning to train diverse LLM agents through FL. In our design,\nknowledge compendiums generated by a novel LLM-enhanced Knowledge Compendiums\nGeneration (KCG) module are transmitted between clients and the server instead\nof model parameters in previous FL methods. Apart from that, an incredible\nRetrieval Augmented Generation (RAG) based Tool Learning and Utilizing (TLU)\nmodule is designed and we incorporate the aggregated global knowledge\ncompendium as a teacher to teach LLM agents the usage of tools. We conducted\nextensive experiments and the results show that FICAL has competitive\nperformance compared to other SOTA baselines with a significant communication\ncost decrease of $\\mathbf{3.33\\times10^5}$ times.\nPublished: 2024-12-11 03:00:24+00:00\nJournal Reference: None\nDOI: None\nPrimary Category: cs.LG\nCategories: cs.LG, cs.AI, cs.CL, cs.CR\nPDF URL: http://arxiv.org/pdf/2412.08054v1\narXiv URL: http://arxiv.org/abs/2412.08054v1", "mimetype": "text/plain", "start_char_idx": 0, "end_char_idx": 2113, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "73ed22dc-c89b-4d52-a35b-3b1a59ae4eb2": {"__data__": {"id_": "73ed22dc-c89b-4d52-a35b-3b1a59ae4eb2", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "37191111-9247-427c-a43b-7f1a6ff23a3a", "node_type": "4", "metadata": {}, "hash": "b2795736168ab74c76b5f60425ef8d80718f4f57dee8dcac4f824451393ae79a", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "Title: MAGIC: Mastering Physical Adversarial Generation in Context through Collaborative LLM Agents\nAuthors: Yun Xing, Nhat Chung, Jie Zhang, Yue Cao, Ivor Tsang, Yang Liu, Lei Ma, Qing Guo\nSummary: Physical adversarial attacks in driving scenarios can expose critical\nvulnerabilities in visual perception models. However, developing such attacks\nremains challenging due to diverse real-world backgrounds and the requirement\nfor maintaining visual naturality. Building upon this challenge, we reformulate\nphysical adversarial attacks as a one-shot patch-generation problem. Our\napproach generates adversarial patches through a deep generative model that\nconsiders the specific scene context, enabling direct physical deployment in\nmatching environments. The primary challenge lies in simultaneously achieving\ntwo objectives: generating adversarial patches that effectively mislead object\ndetection systems while determining contextually appropriate placement within\nthe scene. We propose MAGIC (Mastering Physical Adversarial Generation In\nContext), a novel framework powered by multi-modal LLM agents to address these\nchallenges. MAGIC automatically understands scene context and orchestrates\nadversarial patch generation through the synergistic interaction of language\nand vision capabilities. MAGIC orchestrates three specialized LLM agents: The\nadv-patch generation agent (GAgent) masters the creation of deceptive patches\nthrough strategic prompt engineering for text-to-image models. The adv-patch\ndeployment agent (DAgent) ensures contextual coherence by determining optimal\nplacement strategies based on scene understanding. The self-examination agent\n(EAgent) completes this trilogy by providing critical oversight and iterative\nrefinement of both processes. We validate our method on both digital and\nphysical level, \\ie, nuImage and manually captured real scenes, where both\nstatistical and visual results prove that our MAGIC is powerful and effectively\nfor attacking wide-used object detection systems.\nPublished: 2024-12-11 01:41:19+00:00\nJournal Reference: None\nDOI: None\nPrimary Category: cs.CV\nCategories: cs.CV, cs.AI\nPDF URL: http://arxiv.org/pdf/2412.08014v1\narXiv URL: http://arxiv.org/abs/2412.08014v1", "mimetype": "text/plain", "start_char_idx": 0, "end_char_idx": 2223, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "b2dd87cb-2c16-4133-88fd-11dd778b54d8": {"__data__": {"id_": "b2dd87cb-2c16-4133-88fd-11dd778b54d8", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "4d507172-589c-4f01-9292-492b28b20cce", "node_type": "4", "metadata": {}, "hash": "a2510024218bbdd1221a5ffa5eb6f97e06297bee1cdcbe3abfe2df0c61bfd4f6", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "Title: MAGE: A Multi-Agent Engine for Automated RTL Code Generation\nAuthors: Yujie Zhao, Hejia Zhang, Hanxian Huang, Zhongming Yu, Jishen Zhao\nSummary: The automatic generation of RTL code (e.g., Verilog) through natural language\ninstructions has emerged as a promising direction with the advancement of large\nlanguage models (LLMs). However, producing RTL code that is both syntactically\nand functionally correct remains a significant challenge. Existing\nsingle-LLM-agent approaches face substantial limitations because they must\nnavigate between various programming languages and handle intricate generation,\nverification, and modification tasks. To address these challenges, this paper\nintroduces MAGE, the first open-source multi-agent AI system designed for\nrobust and accurate Verilog RTL code generation. We propose a novel\nhigh-temperature RTL candidate sampling and debugging system that effectively\nexplores the space of code candidates and significantly improves the quality of\nthe candidates. Furthermore, we design a novel Verilog-state checkpoint\nchecking mechanism that enables early detection of functional errors and\ndelivers precise feedback for targeted fixes, significantly enhancing the\nfunctional correctness of the generated RTL code. MAGE achieves a 95.7% rate of\nsyntactic and functional correctness code generation on VerilogEval-Human 2\nbenchmark, surpassing the state-of-the-art Claude-3.5-sonnet by 23.3 %,\ndemonstrating a robust and reliable approach for AI-driven RTL design\nworkflows.\nPublished: 2024-12-10 21:53:55+00:00\nJournal Reference: None\nDOI: None\nPrimary Category: cs.AR\nCategories: cs.AR, cs.LG\nPDF URL: http://arxiv.org/pdf/2412.07822v1\narXiv URL: http://arxiv.org/abs/2412.07822v1", "mimetype": "text/plain", "start_char_idx": 0, "end_char_idx": 1724, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "894fe9ba-ff1c-4dac-ad2e-e7c3a990814c": {"__data__": {"id_": "894fe9ba-ff1c-4dac-ad2e-e7c3a990814c", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "7e1a96ce-5cb2-47f7-8971-90428d7e4e2a", "node_type": "4", "metadata": {}, "hash": "a251b7a4276eb22942398151ee4c105631f321688f387dc5c94fd9dda10e6dfc", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "Title: Searching for Structure: Investigating Emergent Communication with Large Language Models\nAuthors: Tom Kouwenhoven, Max Peeperkorn, Tessa Verhoef\nSummary: Human languages have evolved to be structured through repeated language\nlearning and use. These processes introduce biases that operate during language\nacquisition and shape linguistic systems toward communicative efficiency. In\nthis paper, we investigate whether the same happens if artificial languages are\noptimised for implicit biases of Large Language Models (LLMs). To this end, we\nsimulate a classical referential game in which LLMs learn and use artificial\nlanguages. Our results show that initially unstructured holistic languages are\nindeed shaped to have some structural properties that allow two LLM agents to\ncommunicate successfully. Similar to observations in human experiments,\ngenerational transmission increases the learnability of languages, but can at\nthe same time result in non-humanlike degenerate vocabularies. Taken together,\nthis work extends experimental findings, shows that LLMs can be used as tools\nin simulations of language evolution, and opens possibilities for future\nhuman-machine experiments in this field.\nPublished: 2024-12-10 16:32:19+00:00\nJournal Reference: None\nDOI: None\nPrimary Category: cs.CL\nCategories: cs.CL\nPDF URL: http://arxiv.org/pdf/2412.07646v3\narXiv URL: http://arxiv.org/abs/2412.07646v3", "mimetype": "text/plain", "start_char_idx": 0, "end_char_idx": 1404, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}, "aba7eaaf-67eb-4168-825e-9c2cc802e04d": {"__data__": {"id_": "aba7eaaf-67eb-4168-825e-9c2cc802e04d", "embedding": null, "metadata": {}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": [], "relationships": {"1": {"node_id": "e3a3f0cc-7af7-4746-820c-6ce4a054fb49", "node_type": "4", "metadata": {}, "hash": "5e5fd1f3262954d70dececa0a8c62877a2ceb59b672656b04a3f68a9650938ef", "class_name": "RelatedNodeInfo"}}, "metadata_template": "{key}: {value}", "metadata_separator": "\n", "text": "Title: AutoDCWorkflow: LLM-based Data Cleaning Workflow Auto-Generation and Benchmark\nAuthors: Lan Li, Liri Fang, Vetle I. Torvik\nSummary: We investigate the reasoning capabilities of large language models (LLMs) for\nautomatically generating data-cleaning workflows. To evaluate LLMs' ability to\ncomplete data-cleaning tasks, we implemented a pipeline for LLM-based Auto Data\nCleaning Workflow (AutoDCWorkflow), prompting LLMs on data cleaning operations\nto repair three types of data quality issues: duplicates, missing values, and\ninconsistent data formats. Given a dirty table and a purpose (expressed as a\nquery), this pipeline generates a minimal, clean table sufficient to address\nthe purpose and the data cleaning workflow used to produce the table. The\nplanning process involves three main LLM-driven components: (1) Select Target\nColumns: Identifies a set of target columns related to the purpose. (2) Inspect\nColumn Quality: Assesses the data quality for each target column and generates\na Data Quality Report as operation objectives. (3) Generate Operation &\nArguments: Predicts the next operation and arguments based on the data quality\nreport results. Additionally, we propose a data cleaning benchmark to evaluate\nthe capability of LLM agents to automatically generate workflows that address\ndata cleaning purposes of varying difficulty levels. The benchmark comprises\nthe annotated datasets as a collection of purpose, raw table, clean table, data\ncleaning workflow, and answer set. In our experiments, we evaluated three LLMs\nthat auto-generate purpose-driven data cleaning workflows. The results indicate\nthat LLMs perform well in planning and generating data-cleaning workflows\nwithout the need for fine-tuning.\nPublished: 2024-12-09 18:13:27+00:00\nJournal Reference: None\nDOI: None\nPrimary Category: cs.DB\nCategories: cs.DB, cs.CL\nPDF URL: http://arxiv.org/pdf/2412.06724v2\narXiv URL: http://arxiv.org/abs/2412.06724v2", "mimetype": "text/plain", "start_char_idx": 0, "end_char_idx": 1937, "metadata_seperator": "\n", "text_template": "{metadata_str}\n\n{content}", "class_name": "TextNode"}, "__type__": "1"}}, "docstore/ref_doc_info": {"6bd29218-fa1f-41b9-801c-4a309986263c": {"node_ids": ["e705f9a7-c5f5-4c0b-941f-b1d9d5301b3e"], "metadata": {}}, "044a8f5d-3e9b-452b-81e0-dd7fb55823a8": {"node_ids": ["5060d247-f056-4a31-b81d-28a5dea38624"], "metadata": {}}, "6731a091-04a0-42ab-b648-f71f1b3f4e83": {"node_ids": ["12aec01a-74aa-474b-8c31-5ebfc27d3234"], "metadata": {}}, "478cc06b-9553-4eea-96a1-0134b150348a": {"node_ids": ["e8afa57e-0f27-483c-8f64-c2e198cfe7c0"], "metadata": {}}, "da6c62ab-1a0f-441e-914a-187cc01a2daa": {"node_ids": ["fdad2622-2190-4427-9067-c31534e49d59"], "metadata": {}}, "90338cfe-1bf2-4827-932a-096909ab9872": {"node_ids": ["71eaa072-dc72-4228-84a0-6935bfce3f98"], "metadata": {}}, "55b6a4bb-4ea6-4be6-abe5-13c3836c9b45": {"node_ids": ["e4fe9c90-10d4-4cd5-a12d-51d7905bbb33"], "metadata": {}}, "a30ff13f-7b36-4e5a-9136-78a519486858": {"node_ids": ["b646165e-9c51-4eff-a62b-4f37e46872b9"], "metadata": {}}, "13cf01d1-2f55-4058-9894-15fdc7e35c53": {"node_ids": ["c29e002e-43e1-44bf-b179-63a039627571"], "metadata": {}}, "c8957502-16b6-4315-ae07-380d414096aa": {"node_ids": ["414c6224-f841-47a4-8c80-558c49018337"], "metadata": {}}, "040471c5-5c98-4579-9b44-8e3341a925a8": {"node_ids": ["1091bcbd-fd6d-4039-aadd-b241ceff6b7a"], "metadata": {}}, "6a3c28fb-12c5-40a9-b031-48d4cd59dfde": {"node_ids": ["71297e5e-93ef-4382-a45b-5f43c5f36ade"], "metadata": {}}, "47d16afe-8908-4bcf-a028-8d172369f372": {"node_ids": ["72f77e9a-1096-48b5-a14b-01422cff5880"], "metadata": {}}, "548c017c-00a3-4857-98bb-e9aa5c8182e4": {"node_ids": ["b7d9a32c-188d-497c-805d-e0e7b2b39b3e"], "metadata": {}}, "a0b037f8-b3de-44b1-bb1a-7c82f59469e5": {"node_ids": ["02f51336-3954-4b74-b068-899e2322d368"], "metadata": {}}, "f2aa086e-b546-4f44-8dba-69293c4cb9a5": {"node_ids": ["364bc31a-a379-4c49-a666-a56ceb46705b"], "metadata": {}}, "37191111-9247-427c-a43b-7f1a6ff23a3a": {"node_ids": ["73ed22dc-c89b-4d52-a35b-3b1a59ae4eb2"], "metadata": {}}, "4d507172-589c-4f01-9292-492b28b20cce": {"node_ids": ["b2dd87cb-2c16-4133-88fd-11dd778b54d8"], "metadata": {}}, "7e1a96ce-5cb2-47f7-8971-90428d7e4e2a": {"node_ids": ["894fe9ba-ff1c-4dac-ad2e-e7c3a990814c"], "metadata": {}}, "e3a3f0cc-7af7-4746-820c-6ce4a054fb49": {"node_ids": ["aba7eaaf-67eb-4168-825e-9c2cc802e04d"], "metadata": {}}}}