whyumesh committed on
Commit
ebe2332
·
verified ·
1 Parent(s): 56888a5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +169 -4
app.py CHANGED
@@ -1,7 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import (
3
+ Qwen2VLForConditionalGeneration,
4
+ AutoProcessor,
5
+ AutoModelForCausalLM,
6
+ AutoTokenizer
7
+ )
8
+ from qwen_vl_utils import process_vision_info
9
+ from PIL import Image
10
+ import cv2
11
+ import numpy as np
12
  import gradio as gr
13
+ import spaces
14
 
15
+ # Load both models and their processors/tokenizers
16
def load_models():
    """Load the vision-language model and the code model with their preprocessors.

    Both checkpoints are loaded in ``torch.float16`` with ``device_map="auto"``.

    Returns:
        A 4-tuple ``(vision_model, vision_processor, code_model, code_tokenizer)``.
    """
    vision_checkpoint = "Qwen/Qwen2-VL-2B-Instruct"
    code_checkpoint = "Qwen/Qwen2.5-Coder-1.5B-Instruct"
    # The two models share the same loading options.
    load_kwargs = {"torch_dtype": torch.float16, "device_map": "auto"}

    # Vision model and its processor
    vl_model = Qwen2VLForConditionalGeneration.from_pretrained(
        vision_checkpoint, **load_kwargs
    )
    vl_processor = AutoProcessor.from_pretrained(vision_checkpoint)

    # Code model and its tokenizer
    coder_model = AutoModelForCausalLM.from_pretrained(code_checkpoint, **load_kwargs)
    coder_tokenizer = AutoTokenizer.from_pretrained(code_checkpoint)

    return vl_model, vl_processor, coder_model, coder_tokenizer
34
 
35
# Load everything once at import time so every request reuses the same objects.
(
    vision_model,
    vision_processor,
    code_model,
    code_tokenizer,
) = load_models()

# Instructions for the vision model; process_image_for_code prepends this text
# to the user message that accompanies the image.
VISION_SYSTEM_PROMPT = """You are an AI assistant specialized in analyzing images and videos of code editors. Your task is to:
1. Extract and describe any code snippets visible in the image
2. Identify any error messages, warnings, or highlighting that indicates bugs
3. Describe the programming language and context if visible
Be thorough and accurate in your description, as this will be used to fix the code."""

# System message for the code model, which only sees the vision model's
# textual description of the screenshot.
CODE_SYSTEM_PROMPT = """You are an expert code debugging assistant. Based on the description of code and errors provided, your task is to:
1. Identify the bugs and issues in the code
2. Provide a corrected version of the code
3. Explain the fixes made and why they resolve the issues
Be thorough in your explanation and ensure the corrected code is complete and functional."""
48
+
49
def _describe_code_image(image, max_new_tokens=512):
    """Ask the vision model to describe the code and errors visible in ``image``."""
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": f"{VISION_SYSTEM_PROMPT}\n\nDescribe the code and any errors you see in this image."},
            ],
        }
    ]

    prompt = vision_processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = vision_processor(
        text=[prompt],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(vision_model.device)

    with torch.no_grad():
        output_ids = vision_model.generate(**inputs, max_new_tokens=max_new_tokens)

    # The generated sequences start with the prompt tokens; keep only the
    # newly generated part of each sequence.
    completions = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, output_ids)
    ]
    return vision_processor.batch_decode(
        completions,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]


def _suggest_code_fix(description, max_new_tokens=1024):
    """Ask the code model to analyse and fix the code described in ``description``."""
    messages = [
        {"role": "system", "content": CODE_SYSTEM_PROMPT},
        {"role": "user", "content": f"Here's a description of code with errors:\n\n{description}\n\nPlease analyze and fix the code."},
    ]

    prompt = code_tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = code_tokenizer([prompt], return_tensors="pt").to(code_model.device)

    with torch.no_grad():
        output_ids = code_model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            # temperature/top_p only apply when sampling is enabled; request it
            # explicitly instead of relying on the checkpoint's default
            # generation config.
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
        )

    # Strip the prompt tokens so only the model's answer is decoded.
    completions = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, output_ids)
    ]
    return code_tokenizer.batch_decode(completions, skip_special_tokens=True)[0]


def process_image_for_code(image):
    """Describe the code shown in an image, then ask the code model to fix it.

    Runs two stages: the vision model produces a textual description of the
    code and errors in ``image``, and the code model receives that description
    and returns its analysis and corrected code.

    Args:
        image: The image to analyse. It is placed in the ``"image"`` entry of
            the vision chat message; callers in this file pass PIL images.

    Returns:
        A tuple ``(vision_description, fixed_code_response)`` of two strings.
    """
    vision_description = _describe_code_image(image)
    fixed_code_response = _suggest_code_fix(vision_description)
    return vision_description, fixed_code_response
118
+
119
def process_video_for_code(video_path, max_frames=16, frame_interval=30):
    """Sample frames from a video and analyse the code shown in the first one.

    Every ``frame_interval``-th frame (starting with frame 0) is converted to
    an RGB PIL image until ``max_frames`` frames were collected or the video
    ends. Only the first sampled frame is currently analysed.

    Args:
        video_path: Path of the video file, passed to ``cv2.VideoCapture``.
        max_frames: Maximum number of frames to sample.
        frame_interval: Distance, in frames, between two sampled frames.
            Must be non-zero.

    Returns:
        The ``(vision_description, fixed_code_response)`` tuple produced by
        ``process_image_for_code``, or a pair of explanatory messages when no
        frame could be read.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    frame_count = 0

    # Release the capture even if decoding or conversion raises.
    try:
        while len(frames) < max_frames:
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % frame_interval == 0:
                # OpenCV yields BGR arrays; convert to RGB before building the PIL image.
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(Image.fromarray(rgb_frame))

            frame_count += 1
    finally:
        cap.release()

    if not frames:
        return "No frames could be extracted from the video.", "No code could be analyzed."

    # Process the first frame for now (this could be extended to handle multiple frames).
    return process_image_for_code(frames[0])
143
+
144
@spaces.GPU
def process_content(content):
    """Route an uploaded file to the image or video pipeline.

    Args:
        content: The uploaded file, or ``None`` when nothing was uploaded.
            Either a path string or an object exposing the path as ``.name``.

    Returns:
        A tuple ``(vision_output, code_output)`` of two strings. When no file
        was given or the extension is not supported, the first element is a
        user-facing message and the second is empty.
    """
    if content is None:
        return "Please upload an image or video file of code with errors.", ""

    # Accept both a plain path string and a file-like object carrying the
    # path in its ``name`` attribute.
    file_path = getattr(content, "name", content)
    lowered = file_path.lower()

    if lowered.endswith(('.png', '.jpg', '.jpeg')):
        # Copy the pixel data out so the underlying file handle is closed
        # before the (slow) model calls start.
        with Image.open(file_path) as opened:
            image = opened.copy()
        return process_image_for_code(image)

    if lowered.endswith(('.mp4', '.avi', '.mov')):
        return process_video_for_code(file_path)

    return "Unsupported file type. Please provide an image or video file.", ""
158
+
159
+ # Gradio interface
160
# One file upload in; the vision model's description and the code model's
# answer out.
iface = gr.Interface(
    process_content,
    gr.File(label="Upload Image or Video of Code with Errors"),
    [
        gr.Textbox(label="Vision Model Output (Code Description)"),
        gr.Code(label="Fixed Code", language="python"),
    ],
    title="Vision Code Debugger",
    description="Upload an image or video of code with errors, and the AI will analyze and fix the issues.",
)

# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()