czczup committed
Commit 20758ec
1 Parent(s): dda6d62

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+examples/red-panda.mp4 filter=lfs diff=lfs merge=lfs -text
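For context: this attribute line routes the new example video through Git LFS, so a clone without `git lfs` installed checks out only a small text pointer (see the `red-panda.mp4` hunk at the bottom of this diff) instead of the ~1.8 MB file. A minimal, hypothetical sketch for detecting that case; `is_lfs_pointer` is not part of this repo:

```python
# Hypothetical helper (not in this repo): report whether a checked-out file
# is still a Git LFS pointer rather than the resolved binary.
def is_lfs_pointer(path: str) -> bool:
    try:
        with open(path, 'rb') as f:
            head = f.read(100)
    except OSError:
        return False
    # Pointer files begin with the LFS spec line (see the mp4 hunk below).
    return head.startswith(b'version https://git-lfs.github.com/spec/v1')

print(is_lfs_pointer('examples/red-panda.mp4'))
```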
README.md CHANGED
@@ -224,6 +224,60 @@ responses = model.batch_chat(tokenizer, pixel_values,
 for question, response in zip(questions, responses):
     print(f'User: {question}')
     print(f'Assistant: {response}')
+
+# video multi-round conversation (视频多轮对话)
+def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
+    if bound:
+        start, end = bound[0], bound[1]
+    else:
+        start, end = -100000, 100000
+    start_idx = max(first_idx, round(start * fps))
+    end_idx = min(round(end * fps), max_frame)
+    seg_size = float(end_idx - start_idx) / num_segments
+    frame_indices = np.array([
+        int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
+        for idx in range(num_segments)
+    ])
+    return frame_indices
+
+def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
+    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
+    max_frame = len(vr) - 1
+    fps = float(vr.get_avg_fps())
+
+    pixel_values_list, num_patches_list = [], []
+    transform = build_transform(input_size=input_size)
+    frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
+    for frame_index in frame_indices:
+        img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB')
+        img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
+        pixel_values = [transform(tile) for tile in img]
+        pixel_values = torch.stack(pixel_values)
+        num_patches_list.append(pixel_values.shape[0])
+        pixel_values_list.append(pixel_values)
+    pixel_values = torch.cat(pixel_values_list)
+    return pixel_values, num_patches_list
+
+
+video_path = './examples/red-panda.mp4'
+# pixel_values, num_patches_list = load_video(video_path, num_segments=32, max_num=1)
+pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=2)
+pixel_values = pixel_values.to(torch.bfloat16).cuda()
+video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
+question = video_prefix + 'What is the red panda doing?'
+# Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
+response, history = model.chat(tokenizer, pixel_values, question, generation_config,
+                               num_patches_list=num_patches_list,
+                               history=None, return_history=True)
+print(f'User: {question}')
+print(f'Assistant: {response}')
+
+question = 'Describe this video in detail.'
+response, history = model.chat(tokenizer, pixel_values, question, generation_config,
+                               num_patches_list=num_patches_list,
+                               history=history, return_history=True)
+print(f'User: {question}')
+print(f'Assistant: {response}')
 ```
 
 ## License
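A note on the sampling logic added above: `get_index` picks one frame near the midpoint of each of `num_segments` equal slices of the clip, optionally restricted to a `bound=(start, end)` window in seconds. The snippet also assumes `from decord import VideoReader, cpu` and the `build_transform`/`dynamic_preprocess` helpers defined earlier in the README, which sit outside this hunk. A self-contained sketch with made-up numbers (a ~10 s clip at 30 fps) illustrates the index math:

```python
import numpy as np

# Hypothetical clip: 301 frames (max_frame = 300) at 30 fps, 8 segments.
fps, max_frame, num_segments = 30.0, 300, 8
start_idx, end_idx = 0, max_frame
seg_size = float(end_idx - start_idx) / num_segments  # 37.5 frames per segment
indices = np.array([
    int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
    for idx in range(num_segments)
])
print(indices)  # [ 18  56  93 130 168 206 243 280] -> one frame near each segment midpoint
```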
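Likewise, `load_video` returns a single tensor holding every tile of every sampled frame, while `num_patches_list` records how many tiles belong to each frame; `model.chat` uses it to map each `<image>` placeholder in `video_prefix` to its tiles, so the prefix must contain exactly `len(num_patches_list)` placeholders. A hypothetical sanity check, assuming the snippet above has already run:

```python
# Assumes load_video and the README helpers are in scope.
pixel_values, num_patches_list = load_video('./examples/red-panda.mp4',
                                            num_segments=8, max_num=2)

assert len(num_patches_list) == 8                      # one entry per sampled frame
assert pixel_values.shape[0] == sum(num_patches_list)  # tiles are concatenated in frame order

video_prefix = ''.join(f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list)))
print(video_prefix + 'What is the red panda doing?')
# Frame1: <image>
# ...
# Frame8: <image>
# What is the red panda doing?
```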
examples/image1.jpg ADDED
examples/image2.jpg ADDED
examples/red-panda.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d921c07bb97224d65a37801541d246067f0d506f08723ffa1ad85c217907ccb8
+size 1867237
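Those three `+` lines are the entire committed file: a Git LFS pointer (spec version, sha256 oid, byte size) standing in for the ~1.8 MB video. A throwaway sketch, not part of the repo, showing how such a pointer parses:

```python
# Hypothetical helper: split a Git LFS pointer into its key/value fields.
def parse_lfs_pointer(text: str) -> dict:
    fields = dict(line.split(' ', 1) for line in text.strip().splitlines())
    assert fields['version'] == 'https://git-lfs.github.com/spec/v1'
    return fields

pointer = '''version https://git-lfs.github.com/spec/v1
oid sha256:d921c07bb97224d65a37801541d246067f0d506f08723ffa1ad85c217907ccb8
size 1867237
'''
info = parse_lfs_pointer(pointer)
print(info['oid'])        # sha256:d921c0...
print(int(info['size']))  # 1867237 bytes (~1.8 MB)
```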