Image Feature Extraction
ColPali
Safetensors
English
vidore
vidore-experimental
tonywu71 committed on
Commit
2d54d5d
·
verified ·
1 Parent(s): 1817812

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +40 -76
README.md CHANGED
@@ -18,7 +18,7 @@ It was introduced in the paper [ColPali: Efficient Document Retrieval with Visio
18
 
19
  ## Version specificity
20
 
21
- This version is trained with `colpali-engine==0.2.0`.
22
 
23
  Compared to `colpali`, this version is trained with right padding for queries to fix unwanted tokens in the query encoding.
24
  It also stems from the fixed `vidore/colpaligemma-3b-pt-448-base` to guarantee deterministic projection layer initialization.
@@ -52,89 +52,53 @@ We train on an 8 GPU setup with data parallelism, a learning rate of 5e-5 with l
52
 
53
  ## Usage
54
 
 
 
55
  ```bash
56
- pip install colpali-engine==0.2.0
57
  ```
58
 
 
59
 
60
  ```python
 
 
61
  import torch
62
- import typer
63
- from torch.utils.data import DataLoader
64
- from tqdm import tqdm
65
- from transformers import AutoProcessor
66
  from PIL import Image
67
 
68
- from colpali_engine.models.paligemma_colbert_architecture import ColPali
69
- from colpali_engine.trainer.retrieval_evaluator import CustomEvaluator
70
- from colpali_engine.utils.colpali_processing_utils import process_images, process_queries
71
- from colpali_engine.utils.image_from_page_utils import load_from_dataset
72
-
73
-
74
- def main() -> None:
75
- """Example script to run inference with ColPali"""
76
-
77
- # Load model
78
- model_name = "vidore/colpali-v1.2"
79
- model = ColPali.from_pretrained("vidore/colpaligemma-3b-pt-448-base", torch_dtype=torch.bfloat16, device_map="cuda").eval()
80
- model.load_adapter(model_name)
81
- model = model.eval()
82
- processor = AutoProcessor.from_pretrained(model_name)
83
-
84
- # select images -> load_from_pdf(<pdf_path>), load_from_image_urls(["<url_1>"]), load_from_dataset(<path>)
85
- images = load_from_dataset("vidore/docvqa_test_subsampled")
86
- queries = ["From which university does James V. Fiorca come ?", "Who is the japanese prime minister?"]
87
-
88
- # run inference - docs
89
- dataloader = DataLoader(
90
- images,
91
- batch_size=4,
92
- shuffle=False,
93
- collate_fn=lambda x: process_images(processor, x),
94
- )
95
- ds = []
96
- for batch_doc in tqdm(dataloader):
97
- with torch.no_grad():
98
- batch_doc = {k: v.to(model.device) for k, v in batch_doc.items()}
99
- embeddings_doc = model(**batch_doc)
100
- ds.extend(list(torch.unbind(embeddings_doc.to("cpu"))))
101
-
102
- # run inference - queries
103
- dataloader = DataLoader(
104
- queries,
105
- batch_size=4,
106
- shuffle=False,
107
- collate_fn=lambda x: process_queries(processor, x, Image.new("RGB", (448, 448), (255, 255, 255))),
108
- )
109
-
110
- qs = []
111
- for batch_query in dataloader:
112
- with torch.no_grad():
113
- batch_query = {k: v.to(model.device) for k, v in batch_query.items()}
114
- embeddings_query = model(**batch_query)
115
- qs.extend(list(torch.unbind(embeddings_query.to("cpu"))))
116
-
117
- # run evaluation
118
- retriever_evaluator = CustomEvaluator(is_multi_vector=True)
119
- scores = retriever_evaluator.evaluate(qs, ds)
120
- print(scores.argmax(axis=1))
121
-
122
-
123
- if __name__ == "__main__":
124
- typer.run(main)
125
-
126
- ```
127
-
128
- **Note:** If you need to further train ColPali from this adapter, you should run:
129
-
130
- ```python
131
- lora_config = LoraConfig.from_pretrained("vidore/colpali-v1.1")
132
- lora_config.inference_mode = False # force training mode for fine-tuning
133
-
134
- model = get_peft_model(model, lora_config)
135
-
136
- print("after")
137
- model.print_trainable_parameters()
138
  ```
139
 
140
  ## Limitations
 
18
 
19
  ## Version specificity
20
 
21
+ This version is trained with `colpali-engine==0.2.0` but can be loaded for any version `>=0.2.0`.
22
 
23
  Compared to `colpali`, this version is trained with right padding for queries to fix unwanted tokens in the query encoding.
24
  It also stems from the fixed `vidore/colpaligemma-3b-pt-448-base` to guarantee deterministic projection layer initialization.
 
52
 
53
  ## Usage
54
 
55
+ Install [`colpali-engine`](https://github.com/illuin-tech/colpali):
56
+
57
  ```bash
58
+ pip install "colpali-engine>=0.3.0,<0.4.0"
59
  ```
60
 
61
+ Then run the following code:
62
 
63
  ```python
64
+ from typing import cast
65
+
66
  import torch
 
 
 
 
67
  from PIL import Image
68
 
69
+ from colpali_engine.models import ColPali, ColPaliProcessor
70
+
71
+ model = cast(
72
+ ColPali,
73
+ ColPali.from_pretrained(
74
+ "vidore/colpali-v1.2",
75
+ torch_dtype=torch.bfloat16,
76
+ device_map="cuda:0", # or "mps" if on Apple Silicon
77
+ ),
78
+ )
79
+
80
+ processor = cast(ColPaliProcessor, ColPaliProcessor.from_pretrained("google/paligemma-3b-mix-448"))
81
+
82
+ # Your inputs
83
+ images = [
84
+ Image.new("RGB", (32, 32), color="white"),
85
+ Image.new("RGB", (16, 16), color="black"),
86
+ ]
87
+ queries = [
88
+ "Is attention really all you need?",
89
+ "Are Benjamin, Antoine, Merve, and Jo best friends?",
90
+ ]
91
+
92
+ # Process the inputs
93
+ batch_images = processor.process_images(images).to(model.device)
94
+ batch_queries = processor.process_queries(queries).to(model.device)
95
+
96
+ # Forward pass
97
+ with torch.no_grad():
98
+ image_embeddings = model(**batch_images)
99
+ query_embeddings = model(**batch_queries)
100
+
101
+ scores = processor.score_multi_vector(query_embeddings, image_embeddings)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  ```
103
 
104
  ## Limitations