finalf0 committed on
Commit
def3d69
1 Parent(s): 5893ca4

add flash-attn

Browse files
Files changed (2) hide show
  1. app.py +4 -4
  2. requirements.txt +1 -0
app.py CHANGED
@@ -29,8 +29,6 @@ import modelscope_studio as mgr
29
  os.system("pip list|grep torch")
30
  os.system("pip list|grep trans")
31
  os.system("pip list|grep flash")
32
- os.system("nvidia-smi")
33
- os.system("ll /usr/local/cuda*")
34
 
35
  # Argparser
36
  parser = argparse.ArgumentParser(description='demo')
@@ -46,7 +44,8 @@ if 'int4' in model_path:
46
  if device == 'mps':
47
  print('Error: running int4 model with bitsandbytes on Mac is not supported right now.')
48
  exit()
49
- model = AutoModel.from_pretrained(model_path, trust_remote_code=True, attn_implementation='sdpa')
 
50
  else:
51
  if args.multi_gpus:
52
  from accelerate import load_checkpoint_and_dispatch, init_empty_weights, infer_auto_device_map
@@ -72,7 +71,8 @@ else:
72
 
73
  model = load_checkpoint_and_dispatch(model, model_path, dtype=torch.bfloat16, device_map=device_map)
74
  else:
75
- model = AutoModel.from_pretrained(model_path, trust_remote_code=True, attn_implementation='sdpa', torch_dtype=torch.bfloat16)
 
76
  model = model.to(device=device)
77
  tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
78
  model.eval()
 
29
  os.system("pip list|grep torch")
30
  os.system("pip list|grep trans")
31
  os.system("pip list|grep flash")
 
 
32
 
33
  # Argparser
34
  parser = argparse.ArgumentParser(description='demo')
 
44
  if device == 'mps':
45
  print('Error: running int4 model with bitsandbytes on Mac is not supported right now.')
46
  exit()
47
+ #model = AutoModel.from_pretrained(model_path, trust_remote_code=True, attn_implementation='sdpa')
48
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
49
  else:
50
  if args.multi_gpus:
51
  from accelerate import load_checkpoint_and_dispatch, init_empty_weights, infer_auto_device_map
 
71
 
72
  model = load_checkpoint_and_dispatch(model, model_path, dtype=torch.bfloat16, device_map=device_map)
73
  else:
74
+ #model = AutoModel.from_pretrained(model_path, trust_remote_code=True, attn_implementation='sdpa', torch_dtype=torch.bfloat16)
75
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16)
76
  model = model.to(device=device)
77
  tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
78
  model.eval()
requirements.txt CHANGED
@@ -3,6 +3,7 @@ torch==2.1.2
3
  torchvision==0.16.2
4
  transformers==4.40.2
5
  sentencepiece==0.1.99
 
6
  opencv-python
7
  decord
8
  gradio==4.22.0
 
3
  torchvision==0.16.2
4
  transformers==4.40.2
5
  sentencepiece==0.1.99
6
+ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.2/flash_attn-2.6.2+cu123torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
7
  opencv-python
8
  decord
9
  gradio==4.22.0