estyle committed on
Commit
edb09fc
·
verified ·
1 Parent(s): f29bcc9

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +3 -16
README.md CHANGED
@@ -2,8 +2,6 @@
2
  language: en
3
  license: mit
4
  ---
5
- # Under testing
6
-
7
  # Kosmos-2.5
8
 
9
  [Microsoft Document AI](https://www.microsoft.com/en-us/research/project/document-ai/) | [GitHub](https://github.com/microsoft/unilm/tree/master/kosmos-2.5)
@@ -18,41 +16,32 @@ Kosmos-2.5 is a multimodal literate model for machine reading of text-intensive
18
  Since this is a generative model, there is a risk of **hallucination** during the generation process, and the model **CANNOT** guarantee the accuracy of all OCR/Markdown results in the images.
19
 
20
  ## Use with transformers:
21
- ```bash
22
- pip install git+https://github.com/tic-top/transformers.git
23
- ```
24
  ```python
25
  from PIL import Image
26
  import requests
27
  import torch
28
- from transformers import AutoProcessor, AutoModelForVision2Seq
29
  import re
30
-
31
- repo = "kirp/kosmos2_5"
32
  device = "cuda:0"
33
  dtype = torch.bfloat16
34
- model = AutoModelForVision2Seq.from_pretrained(repo, device_map=device, torch_dtype=dtype)
35
  processor = AutoProcessor.from_pretrained(repo)
36
-
37
  url = "https://huggingface.co/kirp/kosmos2_5/resolve/main/receipt_00008.png"
38
  image = Image.open(requests.get(url, stream=True).raw)
39
  prompt = "<ocr>" # <md>
40
-
41
  inputs = processor(text=prompt, images=image, return_tensors="pt")
42
  height, width = inputs.pop("height"), inputs.pop("width")
43
  raw_width, raw_height = image.size
44
  scale_height = raw_height / height
45
  scale_width = raw_width / width
46
-
47
  inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
48
  inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
49
-
50
  generated_ids = model.generate(
51
  **inputs,
52
  max_new_tokens=1024,
53
  )
54
  generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
55
-
56
  def postprocess(y, scale_height, scale_width):
57
  y = y.replace(prompt, "")
58
  if "<md>" in prompt:
@@ -73,7 +62,6 @@ def postprocess(y, scale_height, scale_width):
73
  y1 = int(y1 * scale_height)
74
  info += f"{x0},{y0},{x1},{y0},{x1},{y1},{x0},{y1},{lines[i]}"
75
  return info
76
-
77
  output_text = postprocess(generated_text[0], scale_height, scale_width)
78
  print(output_text)
79
  ```
@@ -115,4 +103,3 @@ The content of this project itself is licensed under the [MIT](https://github.co
115
  [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct)
116
 
117
 
118
-
 
2
  language: en
3
  license: mit
4
  ---
 
 
5
  # Kosmos-2.5
6
 
7
  [Microsoft Document AI](https://www.microsoft.com/en-us/research/project/document-ai/) | [GitHub](https://github.com/microsoft/unilm/tree/master/kosmos-2.5)
 
16
  Since this is a generative model, there is a risk of **hallucination** during the generation process, and the model **CANNOT** guarantee the accuracy of all OCR/Markdown results in the images.
17
 
18
  ## Use with transformers:
 
 
 
19
  ```python
20
  from PIL import Image
21
  import requests
22
  import torch
23
+ from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
24
  import re
25
+ repo = "microsoft/kosmos-2.5"
 
26
  device = "cuda:0"
27
  dtype = torch.bfloat16
28
+ model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, torch_dtype=dtype)
29
  processor = AutoProcessor.from_pretrained(repo)
 
30
  url = "https://huggingface.co/kirp/kosmos2_5/resolve/main/receipt_00008.png"
31
  image = Image.open(requests.get(url, stream=True).raw)
32
  prompt = "<ocr>" # <md>
 
33
  inputs = processor(text=prompt, images=image, return_tensors="pt")
34
  height, width = inputs.pop("height"), inputs.pop("width")
35
  raw_width, raw_height = image.size
36
  scale_height = raw_height / height
37
  scale_width = raw_width / width
 
38
  inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
39
  inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
 
40
  generated_ids = model.generate(
41
  **inputs,
42
  max_new_tokens=1024,
43
  )
44
  generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
 
45
  def postprocess(y, scale_height, scale_width):
46
  y = y.replace(prompt, "")
47
  if "<md>" in prompt:
 
62
  y1 = int(y1 * scale_height)
63
  info += f"{x0},{y0},{x1},{y0},{x1},{y1},{x0},{y1},{lines[i]}"
64
  return info
 
65
  output_text = postprocess(generated_text[0], scale_height, scale_width)
66
  print(output_text)
67
  ```
 
103
  [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct)
104
 
105