approach0
/

dpr-cocondenser-120

+## About
+Here we share a pretrained BERT model that is aware of math tokens. Math tokens are treated specially and are tokenized using [pya0](https://github.com/approach0/pya0), which adds only a small number of new tokens for LaTeX markup (the total vocabulary size is just 31061).
+### Usage
+Download and try it out
+```sh
+pip install pya0==0.3.2
+wget https://vault.cs.uwaterloo.ca/s/gqstFZmWHCLGXe3/download -O ckpt.tar.gz
+mkdir -p ckpt
+tar xzf ckpt.tar.gz -C ckpt --strip-components=1
+python test.py --test_file test.txt
+```
+### Test file format
+Modify the test examples in `test.txt` to play with it.
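+The test file is tab-separated. The first column lists additional positions you want to mask in the sentence given in the second column (useful for masking tokens inside math markup); positions are comma-separated and 1-based, counted over the whitespace-separated tokens of the preprocessed sentence. A zero means no additional mask positions.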
+### Example output
+![](https://i.imgur.com/xpl87KO.png)

test.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import re
+import os
+import fire
+import torch
+from functools import partial
+from transformers import BertTokenizer
+from transformers import BertForPreTraining
+from pya0.preprocess import preprocess_for_transformer
def highlight_masked(txt):
    """Return *txt* with every literal ``[MASK]`` wrapped in ANSI colour codes.

    Each occurrence is prefixed with the escape sequence ``\\033[92m`` and
    followed by the reset sequence ``\\033[0m``; all other text is returned
    unchanged.
    """
    colour_on, colour_off = '\033[92m', '\033[0m'
    # The target is a fixed literal, so a plain substring replacement is
    # enough; no regular expression is required.
    return txt.replace('[MASK]', colour_on + '[MASK]' + colour_off)
def classifier_hook(tokenizer, tokens, topk, module, inputs, outputs):
    """Forward hook that prints the best candidates for every masked token.

    The first three arguments are meant to be pre-bound (e.g. with
    ``functools.partial``); the last three are supplied by the forward-hook
    mechanism.

    Args:
        tokenizer: Object providing ``mask_token_id`` and
            ``convert_ids_to_tokens``.
        tokens: Mapping whose ``'input_ids'`` entry holds the encoded input;
            only its first row is inspected.
        topk: Number of candidates to print per masked position.
        module: The hooked module (unused).
        inputs: The hooked module's inputs (unused).
        outputs: Pair ``(unmask_scores, seq_rel_scores)``; only the first row
            of ``unmask_scores`` is used.
            NOTE(review): assumed to be (batch, seq_len, vocab) -- confirm.
    """
    unmask_scores, _seq_rel_scores = outputs

    # Ask the tokenizer for the [MASK] id rather than assuming it; 103 (the
    # value this code used to hard-code) is kept only as a fallback.
    mask_id = getattr(tokenizer, 'mask_token_id', None)
    if mask_id is None:
        mask_id = 103

    # Comparing against a plain int avoids building a CPU tensor that would
    # not match input ids living on another device.
    is_masked = tokens['input_ids'][0] == mask_id
    if not bool(is_masked.any()):
        return

    masked_scores = unmask_scores[0][is_masked]
    # Only the k best entries are needed, so avoid sorting the whole row.
    k = min(topk, masked_scores.size(-1))
    best_ids = torch.topk(masked_scores, k, dim=1).indices.detach().cpu()

    for i, row in enumerate(best_ids):
        print(f'MASK[{i}] top candidates: ' +
            str(tokenizer.convert_ids_to_tokens(row.tolist())))
def test(
    test_file='test.txt',
    ckpt_bert='ckpt/bert-pretrained-for-math-7ep/6_3_1382',
    ckpt_tokenizer='ckpt/bert-tokenizer-for-math'
    ):
    """Run the masked-token demo over every example in *test_file*.

    Each non-blank line has two tab-separated columns: a comma-separated list
    of 1-based positions to mask (``0`` meaning "none"), and the sentence
    itself. Positions index the whitespace-separated tokens of the sentence
    *after* ``preprocess_for_transformer`` has been applied. For each example
    the masked sentence is printed, followed by the top 3 candidates for each
    ``[MASK]`` (printed by ``classifier_hook``).

    Args:
        test_file: Path of the tab-separated examples file.
        ckpt_bert: Checkpoint directory passed to
            ``BertForPreTraining.from_pretrained``.
        ckpt_tokenizer: Tokenizer directory passed to
            ``BertTokenizer.from_pretrained``.

    Raises:
        ValueError: If a line has no tab-separated sentence column or a
            position is not an integer.
        IndexError: If a non-zero position lies outside the sentence.
    """
    topk = 3
    tokenizer = BertTokenizer.from_pretrained(ckpt_tokenizer)
    model = BertForPreTraining.from_pretrained(ckpt_bert,
        tie_word_embeddings=True
    )
    with open(test_file, 'r', encoding='utf-8') as fh:
        for lineno, raw_line in enumerate(fh, start=1):
            # parse test file line; blank lines carry no example
            line = raw_line.rstrip()
            if not line:
                continue
            fields = line.split('\t')
            if len(fields) < 2:
                raise ValueError(
                    f'{test_file}:{lineno}: expected "<positions>TAB<sentence>"')
            maskpos = [int(p) for p in fields[0].split(',')]

            # preprocess and mask words
            words = preprocess_for_transformer(fields[1]).split()
            for pos in maskpos:
                if pos == 0:
                    continue  # 0 is the "no additional mask" marker
                # Reject out-of-range values explicitly; a negative position
                # would otherwise silently index from the end of the list.
                if not 1 <= pos <= len(words):
                    raise IndexError(
                        f'{test_file}:{lineno}: mask position {pos} is '
                        f'outside 1..{len(words)}')
                words[pos - 1] = '[MASK]'
            sentence = ' '.join(words)
            encoded = tokenizer(sentence,
                padding=True, truncation=True, return_tensors="pt")
            print('*', highlight_masked(sentence))

            # print unmasked candidates: the hook fires during the forward
            # pass and is always detached afterwards, even if the pass fails
            partial_hook = partial(classifier_hook, tokenizer, encoded, topk)
            hook = model.cls.register_forward_hook(partial_hook)
            try:
                with torch.no_grad():
                    model(**encoded)
            finally:
                hook.remove()
if __name__ == '__main__':
    # Use `cat` as the pager for this process before handing over to the CLI.
    # NOTE(review): presumably this keeps python-fire from opening an
    # interactive pager for its help output -- confirm.
    os.environ.update(PAGER='cat')
    # Expose `test` as the command-line entry point.
    fire.Fire(test)

test.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+0	She needs to [MASK] that [MASK] gets only ten minutes.
+8	Determine the [MASK] of [imath]f(x) = x + \sqrt{ 4 - x^2}[/imath] without [MASK]
+4,12	Solve [imath]y''-4y'+4y=xe^x[/imath]
+4	[imath]f(x, y)[/imath]
+2	[imath]x + x = 2x[/imath]
+10,11	With Euler's [MASK], it [MASK] to [imath]\int_0^\infty \frac{1+x^2}{1+x}dx[/imath]
+6,12	Proof by [MASK] that [imath]n!>3n[/imath] [MASK] [imath]n>6[/imath]