ParamTh committed on
Commit 1d8564b · 1 Parent(s): ed4adf0

Add application file

Files changed (1)
  1. app.py +69 -0
app.py ADDED
@@ -0,0 +1,69 @@
+ import streamlit as st
+ from PIL import Image
+ from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
+ from qwen_vl_utils import process_vision_info
+ 
+ # Load the Qwen2-VL-2B-Instruct model and its processor from the Hugging Face Hub
+ def load_model_and_processor():
+     model = Qwen2VLForConditionalGeneration.from_pretrained(
+         "Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto"
+     )
+     processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
+     return model, processor
+ 
+ st.title('Image OCR and RAG')
+ 
+ # Sidebar: image upload widget
+ with st.sidebar:
+     st.header("Upload your image")
+     uploaded_file = st.file_uploader("Upload an image...", type=["jpg", "jpeg", "png"])
+     if uploaded_file is not None:
+         st.success("Image uploaded successfully!")
+ 
+ model, processor = load_model_and_processor()
+ 
+ if uploaded_file is not None:
+     image = Image.open(uploaded_file)
+     st.image(image, caption="Uploaded Image", use_column_width=True)
+ 
+     try:
+         # Build a chat-style message containing the image and the OCR instruction
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "image",
+                         "image": image,
+                     },
+                     {"type": "text", "text": "Extract all the text present in the image and give the output in JSON format"},
+                 ],
+             }
+         ]
+ 
+         # Prepare the text prompt and vision inputs for the processor
+         text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+         image_inputs, video_inputs = process_vision_info(messages)
+         inputs = processor(
+             text=[text],
+             images=image_inputs,
+             videos=video_inputs,
+             padding=True,
+             return_tensors="pt",
+         )
+         inputs = inputs.to(model.device)  # keep the inputs on the same device as the model
+ 
+         # Generate output using the model
+         generated_ids = model.generate(**inputs, max_new_tokens=300)
+         # Strip the prompt tokens so only the newly generated tokens are decoded
+         generated_ids_trimmed = [
+             out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+         ]
+         output_text = processor.batch_decode(
+             generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+         )
+ 
+         # Display the extracted text in JSON format
+         st.subheader("Extracted Text in JSON Format:")
+         st.json(output_text[0])
+ 
+     except Exception as e:
+         st.error(f"An error occurred: {str(e)}")
+ 
+ else:
+     st.write("Please upload an image from the sidebar")
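
One possible follow-up, not part of this commit: because Streamlit reruns the whole script on every interaction, load_model_and_processor() reloads the 2B checkpoint on each rerun. A minimal sketch of how the load could be cached, assuming a Streamlit version (>= 1.18) where st.cache_resource is available and the same model ID as in app.py:

import streamlit as st
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

@st.cache_resource
def load_model_and_processor():
    # Loaded once per process; later reruns reuse the cached model and processor
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto"
    )
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
    return model, processor

Locally the app would typically be started with `streamlit run app.py`, with streamlit, transformers, qwen-vl-utils, torch, and Pillow installed (the exact dependency list is an assumption; the commit does not include a requirements file).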