Spaces:

aolko
/

describe-test

Build error

App Files Files Community

aolko committed on Jul 1, 2024

Commit

6eac492

verified ·

1 Parent(s): 55ff40c

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -23

app.py CHANGED Viewed

@@ -11,13 +11,13 @@ from huggingface_hub import hf_hub_download
 # Initialize models
 anime_model_path = hf_hub_download("SmilingWolf/wd-convnext-tagger-v3", "model.onnx")
 anime_model = ort.InferenceSession(anime_model_path)
-photo_model = AutoModelForZeroShotImageClassification.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)
 processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)
 # Load labels for the anime model
 labels_path = hf_hub_download("SmilingWolf/wd-convnext-tagger-v3", "selected_tags.csv")
 with open(labels_path, 'r') as f:
-    labels = [line.strip().split(',')[0] for line in f.readlines()[1:]]  # Skip header
 def preprocess_image(image):
     image = image.convert('RGB')
@@ -28,6 +28,39 @@ def preprocess_image(image):
     image = image / 255.0
     return image[np.newaxis, ...]
 def get_booru_image(booru, image_id):
     if booru == "Gelbooru":
         url = f"https://gelbooru.com/index.php?page=dapi&s=post&q=index&json=1&id={image_id}"
@@ -51,27 +84,6 @@ def get_booru_image(booru, image_id):
     return img, tags
-def transcribe_image(image, image_type, transcriber, booru_tags=None):
-    if image_type == "Anime":
-        input_image = preprocess_image(image)
-        input_name = anime_model.get_inputs()[0].name
-        output_name = anime_model.get_outputs()[0].name
-        probs = anime_model.run([output_name], {input_name: input_image})[0]
-        # Get top 50 tags
-        top_indices = probs[0].argsort()[-50:][::-1]
-        tags = [labels[i] for i in top_indices]
-    else:
-        inputs = processor(images=image, return_tensors="pt")
-        outputs = photo_model(**inputs)
-        tags = outputs.logits.topk(50).indices.squeeze().tolist()
-        tags = [processor.config.id2label[t] for t in tags]
-    if booru_tags:
-        tags = list(set(tags + booru_tags))
-    return ", ".join(tags)
 def update_image(image_type, booru, image_id, uploaded_image):
     if image_type == "Anime" and booru != "Upload":
         image, booru_tags = get_booru_image(booru, image_id)

 # Initialize models
 anime_model_path = hf_hub_download("SmilingWolf/wd-convnext-tagger-v3", "model.onnx")
 anime_model = ort.InferenceSession(anime_model_path)
+photo_model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)
 processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)
 # Load labels for the anime model
 labels_path = hf_hub_download("SmilingWolf/wd-convnext-tagger-v3", "selected_tags.csv")
 with open(labels_path, 'r') as f:
+    # selected_tags.csv columns are tag_id,name,category,count: column 0 is the
+    # numeric id, column 1 is the human-readable tag name the app should emit.
+    anime_labels = [line.strip().split(',')[1] for line in f.readlines()[1:]]  # Skip header
 def preprocess_image(image):
     image = image.convert('RGB')
     image = image / 255.0
     return image[np.newaxis, ...]
+def transcribe_image(image, image_type, transcriber, booru_tags=None):
+    """Return a comma-separated tag string describing ``image``.
+
+    Args:
+        image: Image object. The anime branch hands it to ``preprocess_image``;
+            the photo branch reads ``image.width`` and ``image.height``.
+        image_type: ``"Anime"`` selects the ONNX tagger; any other value
+            selects the Florence-2 branch.
+        transcriber: Not used by this function; kept so existing callers
+            keep working.
+        booru_tags: Optional iterable of extra tags merged into the result.
+
+    Returns:
+        The tags joined with ``", "``: model tags first, then any booru tags
+        not already present, each tag appearing once.
+    """
+    if image_type == "Anime":
+        input_image = preprocess_image(image)
+        input_name = anime_model.get_inputs()[0].name
+        output_name = anime_model.get_outputs()[0].name
+        probs = anime_model.run([output_name], {input_name: input_image})[0]
+        # Get top 50 tags, highest score first
+        top_indices = probs[0].argsort()[-50:][::-1]
+        tags = [anime_labels[i] for i in top_indices]
+    else:
+        # The same task token must drive both the prompt and the
+        # post-processing step; "<OD>" is the task that yields class labels.
+        task = "<OD>"
+        inputs = processor(text=task, images=image, return_tensors="pt")
+        generated_ids = photo_model.generate(
+            input_ids=inputs["input_ids"],
+            pixel_values=inputs["pixel_values"],
+            max_new_tokens=1024,
+            do_sample=False,
+            num_beams=3,
+        )
+        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+        parsed_answer = processor.post_process_generation(
+            generated_text, task=task, image_size=(image.width, image.height)
+        )
+        # The parsed result is a dict keyed by the task token, e.g.
+        # {"<OD>": {"bboxes": [...], "labels": [...]}}; iterating it directly
+        # would only yield the key string, so read the labels list explicitly.
+        tags = list(parsed_answer.get(task, {}).get("labels", []))
+    # dict.fromkeys removes duplicates (one label per detected instance, or
+    # overlap with booru tags) while keeping a deterministic order, unlike set().
+    merged = [*tags, *(booru_tags or [])]
+    return ", ".join(dict.fromkeys(merged))
 def get_booru_image(booru, image_id):
     if booru == "Gelbooru":
         url = f"https://gelbooru.com/index.php?page=dapi&s=post&q=index&json=1&id={image_id}"
     return img, tags
 def update_image(image_type, booru, image_id, uploaded_image):
     if image_type == "Anime" and booru != "Upload":
         image, booru_tags = get_booru_image(booru, image_id)