# Package installation
#!pip install git+https://github.com/huggingface/transformers.git
#!pip install torch accelerate bitsandbytes sentencepiece pillow
#!pip install spaces

import gradio as gr
import os
import torch
from transformers import AutoProcessor, MllamaForConditionalGeneration, TextStreamer
from PIL import Image
import csv
import spaces

# Check if we're running in a Hugging Face Space and if SPACES_ZERO_GPU is enabled
IS_SPACES_ZERO = os.environ.get("SPACES_ZERO_GPU", "0") == "1"
IS_SPACE = os.environ.get("SPACE_ID", None) is not None
IS_GDRIVE = False

# Determine the device (GPU if available, else CPU)
device = "cuda" if torch.cuda.is_available() else "cpu"
LOW_MEMORY = os.getenv("LOW_MEMORY", "0") == "1"
print(f"Using device: {device}")
print(f"Low memory mode: {LOW_MEMORY}")

# Get Hugging Face token from environment variables
HF_TOKEN = os.environ.get('HF_TOKEN')

# Define the model name
model_name = "Llama-3.2-11B-Vision-Instruct"

if IS_GDRIVE:
    # Define the path to the model directory in your Google Drive
    model_path = "/content/drive/MyDrive/models/" + model_name
    model = MllamaForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    processor = AutoProcessor.from_pretrained(model_path)
else:
    # Load the model and processor from the Hugging Face Hub
    model_name = "ruslanmv/Llama-3.2-11B-Vision-Instruct"
    model = MllamaForConditionalGeneration.from_pretrained(
        model_name,
        token=HF_TOKEN,
        torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
        device_map="auto" if device == "cuda" else None,  # Use device mapping if CUDA is available
    )
    if device == "cpu":
        # device_map already places the model on the GPU when CUDA is available,
        # so only move it manually when running on CPU
        model.to(device)
    processor = AutoProcessor.from_pretrained(model_name, token=HF_TOKEN)

# Tie the model weights to ensure the model is properly loaded
if hasattr(model, "tie_weights"):
    model.tie_weights()

example = '''Table 1:
header1,header2,header3
value1,value2,value3

Table 2:
header1,header2,header3
value1,value2,value3
'''

prompt_message = """Please extract all tables from the image and generate CSV files.
Each table should be separated using the format table_n.csv, where n is the table number.
You must use CSV format with commas as the delimiter. Do not use markdown format.
Ensure you use the original table headers and content from the image.
Only answer with the CSV content. Don't explain the tables.
An example of the formatting output is as follows:
""" + example


# Stream LLM response generator
def stream_response(inputs):
    streamer = TextStreamer(tokenizer=processor.tokenizer)
    # generate() returns the completed sequence(s); the TextStreamer prints tokens to
    # stdout as they are produced, and each finished sequence is decoded and yielded here
    for token in model.generate(**inputs, max_new_tokens=2000, do_sample=True, streamer=streamer):
        yield processor.decode(token, skip_special_tokens=True)


# Predict function for the Gradio app
@spaces.GPU  # Use the free GPU provided by Hugging Face Spaces
def predict(message, image):
    # Prepare the input messages
    messages = [
        {"role": "user", "content": [
            {"type": "image"},  # Specify that an image is provided
            {"type": "text", "text": message}  # Add the user-provided text input
        ]}
    ]

    # Create the input text using the processor's chat template
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

    # Process the inputs and move them to the appropriate device
    inputs = processor(image, input_text, return_tensors="pt").to(device)

    # Consume the streaming generator and accumulate the full response
    full_response = ""
    for response in stream_response(inputs):
        full_response += response

    return extract_and_save_tables(full_response)


# Extract tables and save them to CSV files
files_list = []


def clean_full_response(full_response):
    """Cleans the full response by removing the prompt input before the tables."""
    # The part of the prompt input to remove
    message_to_remove = prompt_message
    # Remove the message and return only the tables
    return full_response.replace(message_to_remove, "").strip()


def extract_and_save_tables(full_response):
    """Extracts CSV tables from the cleaned response string and saves them as separate files."""
    cleaned_response = clean_full_response(full_response)
    files_list = []  # Initialize the list of file names
    tables = cleaned_response.split("Table ")  # Split the response by table sections

    for i, table in enumerate(tables[1:], start=1):  # Start with index 1 for "Table 1"
        table_name = f"table_{i}.csv"  # File name for the current table
        rows = table.strip().splitlines()[1:]  # Drop the "n:" marker line and split the table into rows
        rows = [row.replace('"', '').split(",") for row in rows if row.strip()]  # Clean and split by commas

        # Save the table as a CSV file
        with open(table_name, mode="w", newline='') as file:
            writer = csv.writer(file)
            writer.writerows(rows)

        files_list.append(table_name)  # Append the saved file to the list

    return files_list


# Gradio interface
def gradio_app():
    def process_image(image):
        message = prompt_message
        files = predict(message, image)
        return "Tables extracted and saved as CSV files.", files

    # Input components
    image_input = gr.Image(type="pil", label="Upload Image")
    # message_input = gr.Textbox(lines=2, placeholder="Enter your message", value=prompt_message)
    output_text = gr.Textbox(label="Extraction Status")
    file_output = gr.File(label="Download CSV files")

    # Gradio interface
    iface = gr.Interface(
        fn=process_image,
        inputs=[image_input],
        outputs=[output_text, file_output],
        title="Table Extractor and CSV Converter",
        description="Upload an image to extract tables and download CSV files.",
        allow_flagging="never"
    )

    iface.launch(debug=True)


# Call the Gradio app function to launch the app
gradio_app()
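
# Optional quick test without the Gradio UI (illustrative sketch, not part of the app).
# It assumes a local image file, here hypothetically named "tables_page.png"; predict()
# returns the list of CSV paths written by extract_and_save_tables(). Comment out the
# gradio_app() call above before running this, since launch() blocks the script.
#
# if __name__ == "__main__":
#     img = Image.open("tables_page.png")
#     csv_files = predict(prompt_message, img)
#     print("Saved CSV files:", csv_files)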