t03i committed
Commit 94a6abc
Parent: 2646ade

Fix code example

Files changed (1)
README.md +18 -16
README.md CHANGED
@@ -1,5 +1,4 @@
  ---
- language: protein
  tags:
  - protein language model
  datasets:
@@ -38,26 +37,29 @@ An extensive, interactive example on how to use this model for common tasks can
  Here is how to use this model to extract the features of a given protein sequence in PyTorch:

  ```python
- from transformers import T5Tokenizer, T5EncoderModel
- import torch
+ sequence_examples = ["PRTEINO", "SEQWENCE"]
+ # this will replace all rare/ambiguous amino acids by X and introduce white-space between all amino acids
+ sequence_examples = [" ".join(list(re.sub(r"[UZOB]", "X", sequence))) for sequence in sequence_examples]

- tokenizer = T5Tokenizer.from_pretrained('Rostlab/prot_t5_xl_half_uniref50-enc', do_lower_case=False)
+ # tokenize sequences and pad up to the longest sequence in the batch
+ ids = tokenizer.batch_encode_plus(sequence_examples, add_special_tokens=True, padding="longest")
+ input_ids = torch.tensor(ids['input_ids']).to(device)
+ attention_mask = torch.tensor(ids['attention_mask']).to(device)

- model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_half_uniref50-enc", torch_dtype=torch.float16)
-
- sequences_Example = ["A E T C Z A O","S K T Z P"]
-
- sequences_Example = [re.sub(r"[UZOB]", "X", sequence) for sequence in sequences_Example]
+ # generate embeddings
+ with torch.no_grad():
+     embedding_repr = model(input_ids=input_ids,attention_mask=attention_mask)

- ids = tokenizer.batch_encode_plus(seqs, add_special_tokens=True, padding="longest")
+ # extract embeddings for the first ([0,:]) sequence in the batch while removing padded & special tokens ([0,:7])
+ emb_0 = embedding_repr.last_hidden_state[0,:7] # shape (7 x 1024)
+ print(f"Shape of per-residue embedding of first sequences: {emb_0.shape}")
+ # do the same for the second ([1,:]) sequence in the batch while taking into account different sequence lengths ([1,:8])
+ emb_1 = embedding_repr.last_hidden_state[1,:8] # shape (8 x 1024)

- input_ids = torch.tensor(ids['input_ids'])
- attention_mask = torch.tensor(ids['attention_mask'])
+ # if you want to derive a single representation (per-protein embedding) for the whole protein
+ emb_0_per_protein = emb_0.mean(dim=0) # shape (1024)

- with torch.no_grad():
-     embedding_rpr = model(input_ids=input_ids,attention_mask=attention_mask)
-     emb_0 = embedding_repr.last_hidden_state[0,:6]
-     emb_1 = embedding_repr.last_hidden_state[1,:4]
+ print(f"Shape of per-protein embedding of first sequences: {emb_0_per_protein.shape}")
  ```

  **NOTE**: Please make sure to explicitly set the model to `float16` (`T5EncoderModel.from_pretrained('Rostlab/prot_t5_xl_half_uniref50-enc', torch_dtype=torch.float16)`) otherwise, the generated embeddings will be full precision.
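For completeness: the added lines use `re`, `torch`, `tokenizer`, `model`, and `device` without defining them, so they are meant to follow the loading code earlier in the README. Below is a minimal setup sketch that makes the new snippet runnable on its own, reusing the model ID from the removed lines and the half-precision loading from the NOTE; the device selection and the `model.eval()` call are assumptions added here, not part of the commit.

```python
# Setup sketch (not part of the commit): defines the names the added snippet relies on.
# Requires transformers, sentencepiece and torch to be installed.
import re  # needed for the re.sub(...) call in the example

import torch
from transformers import T5Tokenizer, T5EncoderModel

# assumption: use a GPU when available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = T5Tokenizer.from_pretrained(
    "Rostlab/prot_t5_xl_half_uniref50-enc", do_lower_case=False
)

# per the NOTE: load the encoder explicitly in float16, otherwise embeddings are full precision
model = T5EncoderModel.from_pretrained(
    "Rostlab/prot_t5_xl_half_uniref50-enc", torch_dtype=torch.float16
).to(device)
model.eval()

print(model.dtype)  # expected: torch.float16 (call model.float() if you must run on CPU)
```

Printing `model.dtype` is a quick way to confirm the weights were actually loaded in `float16`, which is exactly what the NOTE warns about.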