CarolXia committed on
Commit
8bb500b
·
1 Parent(s): 556b48f

Add threads

Browse files
Files changed (1) hide show
  1. app.py +22 -13
app.py CHANGED
@@ -49,8 +49,8 @@ st.write('Loading the pretrained model ...')
49
  model_name = "CarolXia/pii-kd-deberta-v2"
50
  # config = PeftConfig.from_pretrained(model_name)
51
  model = DebertaV2ForTokenClassification.from_pretrained(model_name, token=st.secrets["HUGGINGFACE_TOKEN"])
52
- # if torch.cuda.is_available():
53
- # model = model.to("cuda")
54
  # Try quantization instead
55
  # model = AutoModelForTokenClassification.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
56
  tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/mdeberta-v3-base", token=st.secrets["HUGGINGFACE_TOKEN"])
@@ -141,18 +141,27 @@ entity_set=dict()
141
  dataset = load_dataset("Isotonic/pii-masking-200k", split="train")
142
  unmasked_text = dataset['unmasked_text'] # This will load the entire column inmemory. Must do this to avoid I/O delay later
143
 
144
- st.write('Size of the dataset ', dataset.num_rows)
145
- sizes = [0] * 2
146
  start = time.time()
147
- t1 = threading.Thread(target=process_datasets, args=(0, 25, unmasked_text, sizes, 0, entity_set, []))
148
- t2 = threading.Thread(target=process_datasets, args=(25, 50, unmasked_text, sizes, 1, entity_set, []))
149
- with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], profile_memory=True, record_shapes=True) as prof:
150
- process_datasets(0, 50, unmasked_text, sizes, 0, entity_set, [])
151
- # t1.start()
152
- # t2.start()
153
-
154
- # t1.join()
155
- # t2.join()
 
 
 
 
 
 
 
 
 
156
 
157
  end = time.time()
158
  length = end - start
 
49
  model_name = "CarolXia/pii-kd-deberta-v2"
50
  # config = PeftConfig.from_pretrained(model_name)
51
  model = DebertaV2ForTokenClassification.from_pretrained(model_name, token=st.secrets["HUGGINGFACE_TOKEN"])
52
+ if torch.cuda.is_available():
53
+ model = model.to("cuda")
54
  # Try quantization instead
55
  # model = AutoModelForTokenClassification.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
56
  tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/mdeberta-v3-base", token=st.secrets["HUGGINGFACE_TOKEN"])
 
141
  dataset = load_dataset("Isotonic/pii-masking-200k", split="train")
142
  unmasked_text = dataset['unmasked_text'] # This will load the entire column inmemory. Must do this to avoid I/O delay later
143
 
144
+ st.write('Number of rows in the dataset ', dataset.num_rows)
145
+ sizes = [0] * 5
146
  start = time.time()
147
+ # t0 = threading.Thread(target=process_datasets, args=(0, 50, unmasked_text, sizes, 0, entity_set, []))
148
+ # t1 = threading.Thread(target=process_datasets, args=(25, 50, unmasked_text, sizes, 1, entity_set, []))
149
+ # t2 = threading.Thread(target=process_datasets, args=(20, 30, unmasked_text, sizes, 2, entity_set, []))
150
+ # t3 = threading.Thread(target=process_datasets, args=(30, 40, unmasked_text, sizes, 3, entity_set, []))
151
+ # t4 = threading.Thread(target=process_datasets, args=(40, 50, unmasked_text, sizes, 4, entity_set, []))
152
+ # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], profile_memory=True, record_shapes=True) as prof:
153
+ process_datasets(0, 50, unmasked_text, sizes, 0, entity_set, [])
154
+ # t0.start()
155
+ # t1.start()
156
+ # t2.start()
157
+ # t3.start()
158
+ # t4.start()
159
+
160
+ # t0.join()
161
+ # t1.join()
162
+ # t2.join()
163
+ # t3.join()
164
+ # t4.join()
165
 
166
  end = time.time()
167
  length = end - start