
Quantization made by Richard Erkhov.

  • GitHub
  • Discord
  • Request more models

Qwen2-1.5B-Instruct_MOE_CODE_assistant_16bit - GGUF

| Name | Quant method | Size |
|------|--------------|------|
| Qwen2-1.5B-Instruct_MOE_CODE_assistant_16bit.Q2_K.gguf | Q2_K | 0.63GB |
| Qwen2-1.5B-Instruct_MOE_CODE_assistant_16bit.IQ3_XS.gguf | IQ3_XS | 0.68GB |
| Qwen2-1.5B-Instruct_MOE_CODE_assistant_16bit.IQ3_S.gguf | IQ3_S | 0.71GB |
| Qwen2-1.5B-Instruct_MOE_CODE_assistant_16bit.Q3_K_S.gguf | Q3_K_S | 0.71GB |
| Qwen2-1.5B-Instruct_MOE_CODE_assistant_16bit.IQ3_M.gguf | IQ3_M | 0.72GB |
| Qwen2-1.5B-Instruct_MOE_CODE_assistant_16bit.Q3_K.gguf | Q3_K | 0.77GB |
| Qwen2-1.5B-Instruct_MOE_CODE_assistant_16bit.Q3_K_M.gguf | Q3_K_M | 0.77GB |
| Qwen2-1.5B-Instruct_MOE_CODE_assistant_16bit.Q3_K_L.gguf | Q3_K_L | 0.82GB |
| Qwen2-1.5B-Instruct_MOE_CODE_assistant_16bit.IQ4_XS.gguf | IQ4_XS | 0.84GB |
| Qwen2-1.5B-Instruct_MOE_CODE_assistant_16bit.Q4_0.gguf | Q4_0 | 0.87GB |
| Qwen2-1.5B-Instruct_MOE_CODE_assistant_16bit.IQ4_NL.gguf | IQ4_NL | 0.88GB |
| Qwen2-1.5B-Instruct_MOE_CODE_assistant_16bit.Q4_K_S.gguf | Q4_K_S | 0.88GB |
| Qwen2-1.5B-Instruct_MOE_CODE_assistant_16bit.Q4_K.gguf | Q4_K | 0.92GB |
| Qwen2-1.5B-Instruct_MOE_CODE_assistant_16bit.Q4_K_M.gguf | Q4_K_M | 0.92GB |
| Qwen2-1.5B-Instruct_MOE_CODE_assistant_16bit.Q4_1.gguf | Q4_1 | 0.95GB |
| Qwen2-1.5B-Instruct_MOE_CODE_assistant_16bit.Q5_0.gguf | Q5_0 | 1.02GB |
| Qwen2-1.5B-Instruct_MOE_CODE_assistant_16bit.Q5_K_S.gguf | Q5_K_S | 1.02GB |
| Qwen2-1.5B-Instruct_MOE_CODE_assistant_16bit.Q5_K.gguf | Q5_K | 1.05GB |
| Qwen2-1.5B-Instruct_MOE_CODE_assistant_16bit.Q5_K_M.gguf | Q5_K_M | 1.05GB |
| Qwen2-1.5B-Instruct_MOE_CODE_assistant_16bit.Q5_1.gguf | Q5_1 | 1.1GB |
| Qwen2-1.5B-Instruct_MOE_CODE_assistant_16bit.Q6_K.gguf | Q6_K | 1.19GB |
| Qwen2-1.5B-Instruct_MOE_CODE_assistant_16bit.Q8_0.gguf | Q8_0 | 1.53GB |
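
Any llama.cpp-compatible runtime can load these files. The snippet below is a minimal sketch using llama-cpp-python together with huggingface_hub; the repo_id and the chosen quant file are assumptions about this repository, so adjust both to match what you actually download.

# Minimal sketch: download one of the GGUF quants and run it with llama-cpp-python.
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

gguf_path = hf_hub_download(
    repo_id="RichardErkhov/Agnuxo_-_Qwen2-1.5B-Instruct_MOE_CODE_assistant_16bit-gguf",  # assumed repo id
    filename="Qwen2-1.5B-Instruct_MOE_CODE_assistant_16bit.Q4_K_M.gguf",
)

# Load the quantized model with a 4k context window.
llm = Llama(model_path=gguf_path, n_ctx=4096)

result = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Write a Python function that reverses a string."}],
    max_tokens=256,
)
print(result["choices"][0]["message"]["content"])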

Original model description:

base_model: Agnuxo/Qwen2-1.5B-Instruct_MOE_assistant_16bit
language:
  - en
license: apache-2.0
tags:
  - text-generation-inference
  - transformers
  - unsloth
  - qwen2
  - trl
  - sft

Uploaded model

  • Developed by: Agnuxo
  • License: apache-2.0
  • Finetuned from model: Agnuxo/Qwen2-1.5B-Instruct_MOE_assistant_16bit

This Qwen2 model was trained 2x faster with Unsloth and Hugging Face's TRL library.
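
For reference, a minimal sketch of this kind of Unsloth + TRL SFT setup is shown below. The dataset path and hyperparameters are illustrative assumptions, not the author's actual training recipe, and depending on your trl version the dataset_text_field/max_seq_length arguments may need to move into an SFTConfig.

# Illustrative sketch of LoRA fine-tuning with Unsloth + TRL (not the author's exact recipe).
import torch
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="Agnuxo/Qwen2-1.5B-Instruct_MOE_assistant_16bit",
    max_seq_length=2048,
    load_in_4bit=True,
)
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)

# Hypothetical instruction dataset with a "text" column.
dataset = load_dataset("json", data_files="code_assistant_data.jsonl", split="train")

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=2048,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        max_steps=60,
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        output_dir="outputs",
    ),
)
trainer.train()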

How the MOE System Works

This model is a core component of a larger Multi-Expert Question Answering System. Here's a breakdown of the system's functionality:

  1. Model Loading: The system loads the "director" LLM at startup and registers the expert LLMs (e.g., for programming, biology, mathematics) so they can be loaded on demand.
  2. Expert Routing: When a user asks a question, the system either:
    • Uses keyword matching to identify the relevant domain.
    • Consults the director LLM to classify the question's category.
  3. Dynamic Expert Loading: The system loads the chosen expert LLM into memory, optimizing resource usage by releasing any previously active expert.
  4. Response Generation: The selected expert LLM receives the question and generates a tailored answer.
  5. Chat Interface: A user-friendly chat interface facilitates interaction with the MOE system.

This MOE approach enhances efficiency and accuracy compared to relying on a single, general-purpose LLM.

Repository and Additional Information

Full Code: https://huggingface.co/Agnuxo/Qwen2-1.5B-Instruct_MOE_Director_16bit/resolve/main/MOE-LLMs3.py

GitHub Repository: https://github.com/Agnuxo1/NEBULA

Code Example

The following code demonstrates the implementation of the Multi-Expert Question Answering System:

import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

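# Registry of the director and expert models; each entry maps a role/domain to a Hugging Face model id and pipeline task.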
MODEL_CONFIG = {
    "director": {
        "name": "Agnuxo/Qwen2-1.5B-Instruct_MOE_Director_16bit",
        "task": "text-generation",
    },
    "programming": {
        "name": "Qwen/Qwen2-1.5B-Instruct",
        "task": "text-generation",
    },
    "biology": {
        "name": "Agnuxo/Qwen2-1.5B-Instruct_MOE_BIOLOGY_assistant_16bit",
        "task": "text-generation",
    },
    "mathematics": {
        "name": "Qwen/Qwen2-Math-1.5B-Instruct",
        "task": "text-generation",
    }
}


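# English and Spanish keywords used for fast local routing of a question to an expert before consulting the director.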
KEYWORDS = {
    "biology": ["cell", "DNA", "protein", "evolution", "genetics", "ecosystem", "organism", "metabolism", "photosynthesis", "microbiology", "célula", "ADN", "proteína", "evolución", "genética", "ecosistema", "organismo", "metabolismo", "fotosíntesis", "microbiología"],
    "mathematics": ["Math" "mathematics", "equation", "integral", "derivative", "function", "geometry", "algebra", "statistics", "probability", "ecuación", "integral", "derivada", "función", "geometría", "álgebra", "estadística", "probabilidad"],
    "programming": ["python", "java", "C++", "HTML", "scrip", "code", "Dataset", "API", "framework", "debugging", "algorithm", "compiler", "database", "CSS", "JSON", "XML", "encryption", "IDE", "repository", "Git", "version control", "front-end", "back-end", "API", "stack trace", "REST", "machine learning"]
}

class MOELLM:
    def __init__(self):
        self.current_expert = None
        self.current_model = None
        self.current_tokenizer = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")
        self.load_director_model()

    def load_director_model(self):
        """Loads the director model."""
        print("Loading director model...")
        model_name = MODEL_CONFIG["director"]["name"]
        self.director_tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.director_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(self.device)
        self.director_pipeline = pipeline(
            MODEL_CONFIG["director"]["task"],
            model=self.director_model,
            tokenizer=self.director_tokenizer,
            device=self.device
        )
        print("Director model loaded.")

    def load_expert_model(self, expert):
        """Dynamically loads an expert model, releasing memory from the previous model."""
        if expert not in MODEL_CONFIG:
            raise ValueError(f"Unknown expert: {expert}")

        if self.current_expert != expert:
            print(f"Loading expert model: {expert}...")
            
            # Free memory from the current model if it exists
            if self.current_model:
                del self.current_model
                del self.current_tokenizer
                torch.cuda.empty_cache()
            
            model_config = MODEL_CONFIG[expert]
            self.current_tokenizer = AutoTokenizer.from_pretrained(model_config["name"])
            self.current_model = AutoModelForCausalLM.from_pretrained(model_config["name"], torch_dtype=torch.float16).to(self.device)
            self.current_expert = expert
            
            print(f"{expert.capitalize()} model loaded.")
        
        return pipeline(
            MODEL_CONFIG[expert]["task"],
            model=self.current_model,
            tokenizer=self.current_tokenizer,
            device=self.device
        )

    def determine_expert_by_keywords(self, question):
        """Determines the expert based on keywords in the question."""
        question_lower = question.lower()
        for expert, keywords in KEYWORDS.items():
            # Lowercase each keyword so entries such as "DNA" or "C++" still match the lowercased question.
            if any(keyword.lower() in question_lower for keyword in keywords):
                return expert
        return None

    def determine_expert(self, question):
        """Determines which expert should answer the question."""
        expert = self.determine_expert_by_keywords(question)
        if expert:
            print(f"Expert determined by keyword: {expert}")
            return expert

        prompt = f"Classify the following question into one of these categories: programming, biology, mathematics. Question: {question}\nCategory:"
        # max_new_tokens bounds only the generated category label, independent of the prompt length.
        response = self.director_pipeline(prompt, max_new_tokens=10, num_return_sequences=1)[0]['generated_text']
        expert = response.split(":")[-1].strip().lower()
        if expert not in MODEL_CONFIG:
            expert = "director"
        print(f"Redirecting question to: {expert}")
        return expert

    def generate_response(self, question, expert):
        """Generates a response using the appropriate model."""
        try:
            model = self.load_expert_model(expert)
            prompt = f"Answer the following question as an expert in {expert}: {question}\nAnswer:"
            # Use max_new_tokens so the prompt length does not eat into the answer budget.
            response = model(prompt, max_new_tokens=200, num_return_sequences=1)[0]['generated_text']
            return response.split("Answer:")[-1].strip()
        except Exception as e:
            print(f"Error generating response: {str(e)}")
            return "Sorry, there was an error processing your request. Please try again."

    def chat_interface(self):
        """Simple chat interface."""
        print("Welcome to the MOE-LLM chat. Type 'exit' to quit.")
        while True:
            question = input("\nYou: ")
            if question.lower() in ['exit', 'quit']:
                break
            
            try:
                expert = self.determine_expert(question)
                response = self.generate_response(question, expert)
                print(f"\n{expert.capitalize()}: {response}")
            except Exception as e:
                print(f"Error in chat: {str(e)}")
                print("Please try asking another question.")

if __name__ == "__main__":
    moe_llm = MOELLM()
    moe_llm.chat_interface()
