smostafanejad committed on
Commit
369d8de
•
1 Parent(s): acfb17d

Upload accessing_the_data.ipynb

Browse files
Files changed (1) hide show
  1. accessing_the_data.ipynb +8 -1014
accessing_the_data.ipynb CHANGED
@@ -17,7 +17,7 @@
17
  },
18
  {
19
  "cell_type": "code",
20
- "execution_count": 16,
21
  "metadata": {},
22
  "outputs": [],
23
  "source": [
@@ -35,7 +35,7 @@
35
  },
36
  {
37
  "cell_type": "code",
38
- "execution_count": 4,
39
  "metadata": {},
40
  "outputs": [],
41
  "source": [
@@ -63,23 +63,9 @@
63
  },
64
  {
65
  "cell_type": "code",
66
- "execution_count": 5,
67
  "metadata": {},
68
- "outputs": [
69
- {
70
- "data": {
71
- "text/plain": [
72
- "IterableDataset({\n",
73
- " features: ['cid', 'state', 'pubchem-inchi', 'pubchem-charge', 'pubchem-version', 'name', 'coordinates', 'atomic-numbers', 'atom-count', 'heavy-atom-count', 'core-electrons', 'bond-order', 'connection-indices', 'formula', 'version', 'obabel-inchi', 'pm6-obabel-canonical-smiles', 'charge', 'energy-beta-gap', 'energy-beta-homo', 'energy-beta-lumo', 'energy-alpha-gap', 'energy-alpha-homo', 'energy-alpha-lumo', 'total-energy', 'homos', 'orbital-energies', 'mo-count', 'basis-count', 'multiplicity', 'molecular-mass', 'number-of-atoms', 'lowdin-partial-charges', 'mulliken-partial-charges', 'dipole-moment', 'pubchem-multiplicity', 'pubchem-obabel-canonical-smiles', 'pubchem-isomeric-smiles', 'pubchem-molecular-weight', 'pubchem-molecular-formula'],\n",
74
- " n_shards: 430\n",
75
- "})"
76
- ]
77
- },
78
- "execution_count": 5,
79
- "metadata": {},
80
- "output_type": "execute_result"
81
- }
82
- ],
83
  "source": [
84
  "# load the dataset\n",
85
  "hub_dataset = load_dataset(path=path,\n",
@@ -135,59 +121,9 @@
135
  },
136
  {
137
  "cell_type": "code",
138
- "execution_count": 13,
139
  "metadata": {},
140
- "outputs": [
141
- {
142
- "data": {
143
- "text/plain": [
144
- "['cid',\n",
145
- " 'state',\n",
146
- " 'pubchem-inchi',\n",
147
- " 'pubchem-charge',\n",
148
- " 'pubchem-version',\n",
149
- " 'name',\n",
150
- " 'coordinates',\n",
151
- " 'atomic-numbers',\n",
152
- " 'atom-count',\n",
153
- " 'heavy-atom-count',\n",
154
- " 'core-electrons',\n",
155
- " 'bond-order',\n",
156
- " 'connection-indices',\n",
157
- " 'formula',\n",
158
- " 'version',\n",
159
- " 'obabel-inchi',\n",
160
- " 'pm6-obabel-canonical-smiles',\n",
161
- " 'charge',\n",
162
- " 'energy-beta-gap',\n",
163
- " 'energy-beta-homo',\n",
164
- " 'energy-beta-lumo',\n",
165
- " 'energy-alpha-gap',\n",
166
- " 'energy-alpha-homo',\n",
167
- " 'energy-alpha-lumo',\n",
168
- " 'total-energy',\n",
169
- " 'homos',\n",
170
- " 'orbital-energies',\n",
171
- " 'mo-count',\n",
172
- " 'basis-count',\n",
173
- " 'multiplicity',\n",
174
- " 'molecular-mass',\n",
175
- " 'number-of-atoms',\n",
176
- " 'lowdin-partial-charges',\n",
177
- " 'mulliken-partial-charges',\n",
178
- " 'dipole-moment',\n",
179
- " 'pubchem-multiplicity',\n",
180
- " 'pubchem-obabel-canonical-smiles',\n",
181
- " 'pubchem-isomeric-smiles',\n",
182
- " 'pubchem-molecular-weight',\n",
183
- " 'pubchem-molecular-formula']"
184
- ]
185
- },
186
- "execution_count": 13,
187
- "metadata": {},
188
- "output_type": "execute_result"
189
- }
190
- ],
191
  "source": [
192
  "# print the column names\n",
193
  "hub_dataset.column_names"
@@ -205,951 +141,9 @@
205
  },
206
  {
207
  "cell_type": "code",
208
- "execution_count": 15,
209
  "metadata": {},
210
- "outputs": [
211
- {
212
- "data": {
213
- "text/plain": [
214
- "[{'cid': 1,\n",
215
- " 'state': 'S0',\n",
216
- " 'pubchem-inchi': 'InChI=1S/C9H17NO4/c1-7(11)14-8(5-9(12)13)6-10(2,3)4/h8H,5-6H2,1-4H3',\n",
217
- " 'pubchem-charge': 0,\n",
218
- " 'pubchem-version': '20160829',\n",
219
- " 'name': '[email protected]',\n",
220
- " 'coordinates': [4.543149670829423,\n",
221
- " -2.8411897941733857,\n",
222
- " -1.6418598810432616,\n",
223
- " 5.164339625816055,\n",
224
- " -1.776079871333543,\n",
225
- " -0.8127099411272803,\n",
226
- " 6.303009543349172,\n",
227
- " -1.3517299020731781,\n",
228
- " -0.8573999378940053,\n",
229
- " 4.255369691704345,\n",
230
- " -1.2926799063382324,\n",
231
- " 0.08501999385276329,\n",
232
- " 4.60962966603199,\n",
233
- " -0.12267999112548451,\n",
234
- " 0.9209899332554315,\n",
235
- " 3.2824497621808693,\n",
236
- " 0.5602999593942104,\n",
237
- " 1.1527099164801087,\n",
238
- " 3.3767697553673353,\n",
239
- " 1.7012998767313179,\n",
240
- " 2.2331298382255156,\n",
241
- " 3.9935697106646764,\n",
242
- " 1.3808998999270228,\n",
243
- " 3.2894897616953807,\n",
244
- " 2.881229791246801,\n",
245
- " 2.7805597985320656,\n",
246
- " 1.9378698595831967,\n",
247
- " 5.234679620732287,\n",
248
- " -0.683349950484708,\n",
249
- " 2.212169839745514,\n",
250
- " 6.611009521024224,\n",
251
- " -0.1308399905026251,\n",
252
- " 2.561759814391751,\n",
253
- " 7.676599443790074,\n",
254
- " -0.7797299435035565,\n",
255
- " 1.7129698758981244,\n",
256
- " 6.666809516973522,\n",
257
- " 1.3677999008968658,\n",
258
- " 2.3902798268023666,\n",
259
- " 6.871359502159529,\n",
260
- " -0.444549967769325,\n",
261
- " 4.023889708442672,\n",
262
- " 4.4186396798669705,\n",
263
- " -2.4972898190777815,\n",
264
- " -2.6834198055950926,\n",
265
- " 3.5434197432646983,\n",
266
- " -3.130939773157949,\n",
267
- " -1.275939907569318,\n",
268
- " 5.173619625161661,\n",
269
- " -3.7428297288108783,\n",
270
- " -1.6751898786439012,\n",
271
- " 5.315839614871386,\n",
272
- " 0.506469963304397,\n",
273
- " 0.3344999757684973,\n",
274
- " 2.5048198185388078,\n",
275
- " -0.14947998914622534,\n",
276
- " 1.4864398923099749,\n",
277
- " 2.8906797905706103,\n",
278
- " 1.0035699272902916,\n",
279
- " 0.2186999841469097,\n",
280
- " 5.252789619400259,\n",
281
- " -1.7890198704060034,\n",
282
- " 2.186789841567616,\n",
283
- " 4.524179672197005,\n",
284
- " -0.36438997361718856,\n",
285
- " 3.068649777657863,\n",
286
- " 7.449789460262727,\n",
287
- " -0.6580899523412297,\n",
288
- " 0.6238299547913445,\n",
289
- " 7.7404594391760275,\n",
290
- " -1.8650498648728673,\n",
291
- " 1.890259863072561,\n",
292
- " 8.670379371800149,\n",
293
- " -0.34142997526909646,\n",
294
- " 1.891069862987387,\n",
295
- " 6.552169525296595,\n",
296
- " 1.671489878887934,\n",
297
- " 1.3366699031511258,\n",
298
- " 7.59468944975336,\n",
299
- " 1.8007598695576095,\n",
300
- " 2.782579798388253,\n",
301
- " 5.791549580395056,\n",
302
- " 1.8473198661374974,\n",
303
- " 2.9428997867983613,\n",
304
- " 6.800889507241287,\n",
305
- " -1.5241398895588907,\n",
306
- " 4.229209693567309,\n",
307
- " 6.113409557071307,\n",
308
- " 0.06863999502835459,\n",
309
- " 4.660949662319101,\n",
310
- " 7.864189430232259,\n",
311
- " -0.0930299932397971,\n",
312
- " 4.345629685137421],\n",
313
- " 'atomic-numbers': [6,\n",
314
- " 6,\n",
315
- " 8,\n",
316
- " 8,\n",
317
- " 6,\n",
318
- " 6,\n",
319
- " 6,\n",
320
- " 8,\n",
321
- " 8,\n",
322
- " 6,\n",
323
- " 7,\n",
324
- " 6,\n",
325
- " 6,\n",
326
- " 6,\n",
327
- " 1,\n",
328
- " 1,\n",
329
- " 1,\n",
330
- " 1,\n",
331
- " 1,\n",
332
- " 1,\n",
333
- " 1,\n",
334
- " 1,\n",
335
- " 1,\n",
336
- " 1,\n",
337
- " 1,\n",
338
- " 1,\n",
339
- " 1,\n",
340
- " 1,\n",
341
- " 1,\n",
342
- " 1,\n",
343
- " 1],\n",
344
- " 'atom-count': 31,\n",
345
- " 'heavy-atom-count': 14,\n",
346
- " 'core-electrons': [0,\n",
347
- " 0,\n",
348
- " 0,\n",
349
- " 0,\n",
350
- " 0,\n",
351
- " 0,\n",
352
- " 0,\n",
353
- " 0,\n",
354
- " 0,\n",
355
- " 0,\n",
356
- " 0,\n",
357
- " 0,\n",
358
- " 0,\n",
359
- " 0,\n",
360
- " 0,\n",
361
- " 0,\n",
362
- " 0,\n",
363
- " 0,\n",
364
- " 0,\n",
365
- " 0,\n",
366
- " 0,\n",
367
- " 0,\n",
368
- " 0,\n",
369
- " 0,\n",
370
- " 0,\n",
371
- " 0,\n",
372
- " 0,\n",
373
- " 0,\n",
374
- " 0,\n",
375
- " 0,\n",
376
- " 0],\n",
377
- " 'bond-order': [1,\n",
378
- " 1,\n",
379
- " 1,\n",
380
- " 1,\n",
381
- " 2,\n",
382
- " 1,\n",
383
- " 1,\n",
384
- " 1,\n",
385
- " 1,\n",
386
- " 1,\n",
387
- " 1,\n",
388
- " 1,\n",
389
- " 1,\n",
390
- " 1,\n",
391
- " 1,\n",
392
- " 1,\n",
393
- " 1,\n",
394
- " 1,\n",
395
- " 2,\n",
396
- " 1,\n",
397
- " 1,\n",
398
- " 1,\n",
399
- " 1,\n",
400
- " 1,\n",
401
- " 1,\n",
402
- " 1,\n",
403
- " 1,\n",
404
- " 1,\n",
405
- " 1,\n",
406
- " 1],\n",
407
- " 'connection-indices': [15,\n",
408
- " 1,\n",
409
- " 17,\n",
410
- " 1,\n",
411
- " 1,\n",
412
- " 16,\n",
413
- " 1,\n",
414
- " 2,\n",
415
- " 3,\n",
416
- " 2,\n",
417
- " 2,\n",
418
- " 4,\n",
419
- " 4,\n",
420
- " 5,\n",
421
- " 20,\n",
422
- " 6,\n",
423
- " 18,\n",
424
- " 5,\n",
425
- " 23,\n",
426
- " 12,\n",
427
- " 5,\n",
428
- " 6,\n",
429
- " 5,\n",
430
- " 10,\n",
431
- " 6,\n",
432
- " 19,\n",
433
- " 6,\n",
434
- " 7,\n",
435
- " 26,\n",
436
- " 13,\n",
437
- " 12,\n",
438
- " 24,\n",
439
- " 12,\n",
440
- " 25,\n",
441
- " 12,\n",
442
- " 11,\n",
443
- " 9,\n",
444
- " 7,\n",
445
- " 21,\n",
446
- " 10,\n",
447
- " 10,\n",
448
- " 11,\n",
449
- " 10,\n",
450
- " 22,\n",
451
- " 7,\n",
452
- " 8,\n",
453
- " 13,\n",
454
- " 11,\n",
455
- " 13,\n",
456
- " 27,\n",
457
- " 13,\n",
458
- " 28,\n",
459
- " 11,\n",
460
- " 14,\n",
461
- " 14,\n",
462
- " 29,\n",
463
- " 14,\n",
464
- " 31,\n",
465
- " 14,\n",
466
- " 30],\n",
467
- " 'formula': 'C9H17NO4',\n",
468
- " 'version': '1.0',\n",
469
- " 'obabel-inchi': 'InChI=1S/C9H17NO4/c1-7(11)14-8(5-9(12)13)6-10(2,3)4/h8H,5-6H2,1-4H3/t8-/m0/s1',\n",
470
- " 'pm6-obabel-canonical-smiles': '[O]C(=O)C[C@@H](C[N](C)(C)C)OC(=O)C',\n",
471
- " 'charge': 0,\n",
472
- " 'energy-beta-gap': 4.34837933099,\n",
473
- " 'energy-beta-homo': -4.60960862747,\n",
474
- " 'energy-beta-lumo': -0.2612292964799998,\n",
475
- " 'energy-alpha-gap': 4.34837933099,\n",
476
- " 'energy-alpha-homo': -4.60960862747,\n",
477
- " 'energy-alpha-lumo': -0.2612292964799998,\n",
478
- " 'total-energy': -19286.973573267132,\n",
479
- " 'homos': [54],\n",
480
- " 'orbital-energies': [[-522.303488065215,\n",
481
- " -521.209590386205,\n",
482
- " -518.042185166385,\n",
483
- " -517.742859930835,\n",
484
- " -394.46712223881997,\n",
485
- " -281.21333766072,\n",
486
- " -279.556164311175,\n",
487
- " -279.273165906655,\n",
488
- " -279.235069967585,\n",
489
- " -278.79696666828,\n",
490
- " -278.7534284522,\n",
491
- " -278.06770154894,\n",
492
- " -277.890827546115,\n",
493
- " -276.377874537335,\n",
494
- " -30.082186172775,\n",
495
- " -28.150177834225,\n",
496
- " -27.81547779811,\n",
497
- " -26.512052454215002,\n",
498
- " -24.166431062905,\n",
499
- " -22.0412218905,\n",
500
- " -20.950045349995,\n",
501
- " -20.754123377635,\n",
502
- " -20.60446075986,\n",
503
- " -19.681994806665003,\n",
504
- " -17.831620623265,\n",
505
- " -16.88738556203,\n",
506
- " -15.630219572720002,\n",
507
- " -14.933608115439998,\n",
508
- " -14.808435744210001,\n",
509
- " -14.484620262115,\n",
510
- " -13.8233836054,\n",
511
- " -13.145820117655,\n",
512
- " -13.07234937802,\n",
513
- " -12.65329404825,\n",
514
- " -12.460093214395,\n",
515
- " -12.26689238054,\n",
516
- " -11.975730560505,\n",
517
- " -11.722664679540001,\n",
518
- " -11.529463845685001,\n",
519
- " -11.49408904512,\n",
520
- " -11.36891667389,\n",
521
- " -11.246465441165,\n",
522
- " -11.11312965442,\n",
523
- " -10.664141801095,\n",
524
- " -10.54713284538,\n",
525
- " -9.87229049614,\n",
526
- " -9.619224615175,\n",
527
- " -9.543032737035,\n",
528
- " -8.88723835733,\n",
529
- " -8.479067581579999,\n",
530
- " -8.411039118955,\n",
531
- " -7.894022803005001,\n",
532
- " -5.303498946245,\n",
533
- " -5.015058264715,\n",
534
- " -4.60960862747,\n",
535
- " -0.26122929648,\n",
536
- " 0.982331000305,\n",
537
- " 1.6762213190800002,\n",
538
- " 1.9837099701450003,\n",
539
- " 2.266708374665,\n",
540
- " 2.2993620367250003,\n",
541
- " 2.3782750533700003,\n",
542
- " 2.5905238567600004,\n",
543
- " 2.79188810613,\n",
544
- " 3.0395117100849998,\n",
545
- " 3.2381548209499997,\n",
546
- " 3.485778424905,\n",
547
- " 3.5646914415500004,\n",
548
- " 3.6871426742750004,\n",
549
- " 3.7714979679300003,\n",
550
- " 4.291235422385,\n",
551
- " 4.359263885010001,\n",
552
- " 4.468109425210001,\n",
553
- " 4.93614524807,\n",
554
- " 5.12662494342,\n",
555
- " 5.2844509767100005,\n",
556
- " 5.52935344216,\n",
557
- " 5.793303877145,\n",
558
- " 6.0572543121299995,\n",
559
- " 6.381069794225,\n",
560
- " 7.17292109918,\n",
561
- " 7.921234188055001,\n",
562
- " 8.08178135985,\n",
563
- " 8.28858788623,\n",
564
- " 9.401533534775,\n",
565
- " 11.143062177974999,\n",
566
- " 11.98933625303,\n",
567
- " 12.0818549622,\n",
568
- " 12.656015186755,\n",
569
- " 12.857379436125,\n",
570
- " 13.02881116194,\n",
571
- " 13.654673018090001,\n",
572
- " 14.212506411615,\n",
573
- " 14.533600755205,\n",
574
- " 14.59074466381,\n",
575
- " 15.63566184973,\n",
576
- " 15.877843176675,\n",
577
- " 16.08737084156,\n",
578
- " 16.419349739170002,\n",
579
- " 16.862895315485,\n",
580
- " 17.05609614934,\n",
581
- " 17.407123016485002,\n",
582
- " 17.695563698015,\n",
583
- " 18.348636939215,\n",
584
- " 18.69966380636,\n",
585
- " 18.90647033274,\n",
586
- " 19.12960369015,\n",
587
- " 19.47246714178,\n",
588
- " 19.58675495899,\n",
589
- " 19.861589947995,\n",
590
- " 20.78405590119,\n",
591
- " 21.1160347988,\n",
592
- " 21.4425714194,\n",
593
- " 21.50243646651,\n",
594
- " 21.831694225615,\n",
595
- " 22.234422724355003,\n",
596
- " 22.277960940435,\n",
597
- " 22.58000731449,\n",
598
- " 22.952803289675,\n",
599
- " 23.322878126355,\n",
600
- " 23.57050173031,\n",
601
- " 23.60315539237,\n",
602
- " 23.6739049935,\n",
603
- " 23.95690339802,\n",
604
- " 24.43038149789,\n",
605
- " 24.754196979985,\n",
606
- " 24.792292919055,\n",
607
- " 25.06168563105,\n",
608
- " 25.21406938733,\n",
609
- " 25.29570354248,\n",
610
- " 25.461692991285,\n",
611
- " 25.739249118794998,\n",
612
- " 26.005920692285,\n",
613
- " 26.33517845139,\n",
614
- " 26.6127345789,\n",
615
- " 27.453566376945,\n",
616
- " 27.722959088939998,\n",
617
- " 28.057659125054997,\n",
618
- " 28.770597413365,\n",
619
- " 29.211421851174997,\n",
620
- " 29.859052815365,\n",
621
- " 30.648182981815,\n",
622
- " 31.352957854609997,\n",
623
- " 31.71759041428,\n",
624
- " 32.846862893855004,\n",
625
- " 33.12714015987,\n",
626
- " 33.230543423060006,\n",
627
- " 34.61832406061,\n",
628
- " 35.01016800533,\n",
629
- " 36.253728302115,\n",
630
- " 36.525842152615,\n",
631
- " 37.07823326913,\n",
632
- " 37.79661383445,\n",
633
- " 38.30818787339,\n",
634
- " 39.10276031685,\n",
635
- " 39.334057089775,\n",
636
- " 39.8918904833,\n",
637
- " 40.716395450315,\n",
638
- " 41.353141860485,\n",
639
- " 42.40622246192,\n",
640
- " 42.92323877787,\n",
641
- " 44.11509744306,\n",
642
- " 44.376326739540005,\n",
643
- " 44.47428772572,\n",
644
- " 45.785876485130004,\n",
645
- " 46.436228587825,\n",
646
- " 46.78725545497,\n",
647
- " 47.2389644468,\n",
648
- " 48.0553059983,\n",
649
- " 48.253949109165006,\n",
650
- " 48.564158898735,\n",
651
- " 48.760080871095,\n",
652
- " 49.25804921751,\n",
653
- " 49.33424109565,\n",
654
- " 49.96554522881,\n",
655
- " 50.25398591034,\n",
656
- " 50.47711926775,\n",
657
- " 50.931549398085,\n",
658
- " 51.42407546749,\n",
659
- " 51.72340070304,\n",
660
- " 52.23497474198,\n",
661
- " 52.4635503764,\n",
662
- " 53.149277279660005,\n",
663
- " 53.364247221555004,\n",
664
- " 53.658130180095,\n",
665
- " 54.36290505289,\n",
666
- " 54.449981485049996,\n",
667
- " 54.934344138940006,\n",
668
- " 55.064958787180004,\n",
669
- " 55.367005161235,\n",
670
- " 55.731637720905,\n",
671
- " 56.06633775702,\n",
672
- " 56.85002564646,\n",
673
- " 57.258196422210005,\n",
674
- " 57.56296393477,\n",
675
- " 58.38202662477501,\n",
676
- " 59.105849467104996,\n",
677
- " 59.312655993485,\n",
678
- " 59.48952999631,\n",
679
- " 59.70994221521501,\n",
680
- " 59.971171511695,\n",
681
- " 60.226958531164996,\n",
682
- " 60.504514658675,\n",
683
- " 61.26915457857999,\n",
684
- " 62.504551459850006,\n",
685
- " 63.127692177495,\n",
686
- " 64.17533050192,\n",
687
- " 64.983508637905,\n",
688
- " 66.240674627215,\n",
689
- " 67.084227563765,\n",
690
- " 67.74002194347,\n",
691
- " 68.384931769155,\n",
692
- " 69.146850550555,\n",
693
- " 70.43394906342,\n",
694
- " 71.15777190575001,\n",
695
- " 72.19724681466,\n",
696
- " 72.591811897885,\n",
697
- " 72.934675349515,\n",
698
- " 73.34556726377,\n",
699
- " 74.393205588195,\n",
700
- " 75.606833361425,\n",
701
- " 75.759217117705,\n",
702
- " 77.27489126499,\n",
703
- " 79.318466282245,\n",
704
- " 80.89400547664,\n",
705
- " 82.33620888429,\n",
706
- " 82.57022679572,\n",
707
- " 83.75120090688999,\n",
708
- " 84.93761729507,\n",
709
- " 87.28868096339001,\n",
710
- " 108.59519545754,\n",
711
- " 109.6618817515,\n",
712
- " 109.963928125555,\n",
713
- " 111.476881134335,\n",
714
- " 113.16670814594,\n",
715
- " 114.475575766845,\n",
716
- " 115.455185628645,\n",
717
- " 116.527314199615,\n",
718
- " 117.46610698383999,\n",
719
- " 120.60630081861,\n",
720
- " 121.550535879845,\n",
721
- " 121.89612046997999,\n",
722
- " 122.69069291344,\n",
723
- " 127.37105114203999]],\n",
724
- " 'mo-count': 244,\n",
725
- " 'basis-count': 244,\n",
726
- " 'multiplicity': 1,\n",
727
- " 'molecular-mass': 203.23557999999983,\n",
728
- " 'number-of-atoms': 31,\n",
729
- " 'lowdin-partial-charges': [-0.459759,\n",
730
- " 0.210106,\n",
731
- " -0.286001,\n",
732
- " -0.204529,\n",
733
- " 0.007889,\n",
734
- " -0.335887,\n",
735
- " 0.111477,\n",
736
- " -0.468258,\n",
737
- " -0.356875,\n",
738
- " -0.234623,\n",
739
- " 0.097362,\n",
740
- " -0.372531,\n",
741
- " -0.388717,\n",
742
- " -0.348849,\n",
743
- " 0.18522,\n",
744
- " 0.178539,\n",
745
- " 0.177997,\n",
746
- " 0.155744,\n",
747
- " 0.158006,\n",
748
- " 0.158613,\n",
749
- " 0.156412,\n",
750
- " 0.217594,\n",
751
- " 0.202351,\n",
752
- " 0.166462,\n",
753
- " 0.165118,\n",
754
- " 0.174338,\n",
755
- " 0.164152,\n",
756
- " 0.228815,\n",
757
- " 0.170737,\n",
758
- " 0.199278,\n",
759
- " 0.169819],\n",
760
- " 'mulliken-partial-charges': [-0.542286,\n",
761
- " 0.622923,\n",
762
- " -0.486172,\n",
763
- " -0.478169,\n",
764
- " 0.118421,\n",
765
- " -0.37835,\n",
766
- " 0.548339,\n",
767
- " -0.621169,\n",
768
- " -0.538256,\n",
769
- " -0.190811,\n",
770
- " -0.34019,\n",
771
- " -0.352855,\n",
772
- " -0.380417,\n",
773
- " -0.330181,\n",
774
- " 0.196453,\n",
775
- " 0.19234,\n",
776
- " 0.183625,\n",
777
- " 0.162753,\n",
778
- " 0.149387,\n",
779
- " 0.154003,\n",
780
- " 0.163522,\n",
781
- " 0.255434,\n",
782
- " 0.260419,\n",
783
- " 0.179949,\n",
784
- " 0.177859,\n",
785
- " 0.197429,\n",
786
- " 0.171353,\n",
787
- " 0.288802,\n",
788
- " 0.188726,\n",
789
- " 0.241415,\n",
790
- " 0.185706],\n",
791
- " 'dipole-moment': 11.419443262233626,\n",
792
- " 'pubchem-multiplicity': 1,\n",
793
- " 'pubchem-obabel-canonical-smiles': '[O-]C(=O)CC(C[N+](C)(C)C)OC(=O)C',\n",
794
- " 'pubchem-isomeric-smiles': 'CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C',\n",
795
- " 'pubchem-molecular-weight': 203.23558,\n",
796
- " 'pubchem-molecular-formula': 'C9H17NO4'},\n",
797
- " {'cid': 3,\n",
798
- " 'state': 'S0',\n",
799
- " 'pubchem-inchi': 'InChI=1S/C7H8O4/c8-5-3-1-2-4(6(5)9)7(10)11/h1-3,5-6,8-9H,(H,10,11)',\n",
800
- " 'pubchem-charge': 0,\n",
801
- " 'pubchem-version': '20160829',\n",
802
- " 'name': '[email protected]',\n",
803
- " 'coordinates': [1.040909924594702,\n",
804
- " 0.939649931940047,\n",
805
- " 0.08098999410797753,\n",
806
- " 1.3650499010782415,\n",
807
- " -0.3605999738883242,\n",
808
- " 0.0714099948350763,\n",
809
- " 0.34219997520772777,\n",
810
- " -1.4541898946486875,\n",
811
- " -0.08665999372343035,\n",
812
- " -1.141219917312014,\n",
813
- " -0.9958599278305537,\n",
814
- " -0.01889999864761909,\n",
815
- " -1.3611799013969879,\n",
816
- " 0.49082996446293237,\n",
817
- " -0.07803999435546188,\n",
818
- " -0.34924997468982877,\n",
819
- " 1.3769399002199543,\n",
820
- " -0.04337999683195858,\n",
821
- " -2.774219799024303,\n",
822
- " 0.9673199298928236,\n",
823
- " -0.17327998747066597,\n",
824
- " -3.142559772328053,\n",
825
- " 1.9693498573058525,\n",
826
- " -0.7360399466708769,\n",
827
- " -3.7237097301883524,\n",
828
- " 0.20901998488060394,\n",
829
- " 0.4530499671557916,\n",
830
- " -1.7447198735923888,\n",
831
- " -1.499439891346727,\n",
832
- " 1.1895999138141986,\n",
833
- " 0.5514599600513355,\n",
834
- " -2.1256198459830777,\n",
835
- " -1.3319699035140309,\n",
836
- " 1.795109869930254,\n",
837
- " 1.7233998751149642,\n",
838
- " 0.18157998687032353,\n",
839
- " 2.396099826407926,\n",
840
- " -0.7108599484797881,\n",
841
- " 0.1488699892182084,\n",
842
- " 0.5035599635280761,\n",
843
- " -2.278269834909643,\n",
844
- " 0.6579799523273178,\n",
845
- " -1.7243698750404042,\n",
846
- " -1.534139888846425,\n",
847
- " -0.8135899410798237,\n",
848
- " -0.5331099613674413,\n",
849
- " 2.4564898220333253,\n",
850
- " -0.10719999224171652,\n",
851
- " -3.3772997552900454,\n",
852
- " -0.6197999550994765,\n",
853
- " 0.9207699332805251,\n",
854
- " -1.3247099040246133,\n",
855
- " -1.098319920406063,\n",
856
- " 1.9847398562167238,\n",
857
- " 0.5780199581090726,\n",
858
- " -1.4850598923904088,\n",
859
- " -2.077949849455232],\n",
860
- " 'atomic-numbers': [6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 1, 1, 1, 1, 1, 1, 1, 1],\n",
861
- " 'atom-count': 19,\n",
862
- " 'heavy-atom-count': 11,\n",
863
- " 'core-electrons': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
864
- " 'bond-order': [1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1],\n",
865
- " 'connection-indices': [19,\n",
866
- " 11,\n",
867
- " 11,\n",
868
- " 3,\n",
869
- " 15,\n",
870
- " 4,\n",
871
- " 8,\n",
872
- " 7,\n",
873
- " 7,\n",
874
- " 5,\n",
875
- " 7,\n",
876
- " 9,\n",
877
- " 16,\n",
878
- " 6,\n",
879
- " 3,\n",
880
- " 4,\n",
881
- " 3,\n",
882
- " 2,\n",
883
- " 3,\n",
884
- " 14,\n",
885
- " 5,\n",
886
- " 6,\n",
887
- " 5,\n",
888
- " 4,\n",
889
- " 6,\n",
890
- " 1,\n",
891
- " 4,\n",
892
- " 10,\n",
893
- " 2,\n",
894
- " 1,\n",
895
- " 2,\n",
896
- " 13,\n",
897
- " 1,\n",
898
- " 12,\n",
899
- " 9,\n",
900
- " 17,\n",
901
- " 10,\n",
902
- " 18],\n",
903
- " 'formula': 'C7H8O4',\n",
904
- " 'version': '1.0',\n",
905
- " 'obabel-inchi': 'InChI=1S/C7H8O4/c8-5-3-1-2-4(6(5)9)7(10)11/h1-3,5-6,8-9H,(H,10,11)/t5-,6-/m0/s1',\n",
906
- " 'pm6-obabel-canonical-smiles': 'O[C@H]1C=CC=C([C@@H]1O)C(=O)O',\n",
907
- " 'charge': 0,\n",
908
- " 'energy-beta-gap': 4.58783951943,\n",
909
- " 'energy-beta-homo': -6.72121210735,\n",
910
- " 'energy-beta-lumo': -2.1333725879200003,\n",
911
- " 'energy-alpha-gap': 4.58783951943,\n",
912
- " 'energy-alpha-homo': -6.72121210735,\n",
913
- " 'energy-alpha-lumo': -2.1333725879200003,\n",
914
- " 'total-energy': -15575.751067250567,\n",
915
- " 'homos': [40],\n",
916
- " 'orbital-energies': [[-522.17559455548,\n",
917
- " -521.66402051654,\n",
918
- " -521.49530992923,\n",
919
- " -520.37420086517,\n",
920
- " -280.601081497095,\n",
921
- " -279.501741541075,\n",
922
- " -279.37656916984497,\n",
923
- " -278.26906579831,\n",
924
- " -278.14389342708,\n",
925
- " -278.108518626515,\n",
926
- " -278.059538133425,\n",
927
- " -29.358363330445,\n",
928
- " -28.729780335790004,\n",
929
- " -28.117524172165,\n",
930
- " -27.06988584774,\n",
931
- " -23.401791143,\n",
932
- " -21.271139693585,\n",
933
- " -20.48473066564,\n",
934
- " -17.896927947385,\n",
935
- " -17.79080354569,\n",
936
- " -16.2724082599,\n",
937
- " -15.589402495144999,\n",
938
- " -14.906396730389998,\n",
939
- " -13.657394156595,\n",
940
- " -13.450587630215,\n",
941
- " -13.390722583105,\n",
942
- " -13.254665657855,\n",
943
- " -12.04375902313,\n",
944
- " -11.777087449640002,\n",
945
- " -11.567559784755,\n",
946
- " -11.360753258375,\n",
947
- " -10.639651554550001,\n",
948
- " -10.359374288535,\n",
949
- " -9.72534901687,\n",
950
- " -9.51854249049,\n",
951
- " -9.32806279514,\n",
952
- " -8.721248908525,\n",
953
- " -8.114435021910001,\n",
954
- " -7.71714880018,\n",
955
- " -7.344352824994999,\n",
956
- " -6.72121210735,\n",
957
- " -2.13337258792,\n",
958
- " 0.44626671482,\n",
959
- " 1.161926141635,\n",
960
- " 1.8775855684500002,\n",
961
- " 2.076228679315,\n",
962
- " 2.413649853935,\n",
963
- " 2.98781007849,\n",
964
- " 3.251760513475,\n",
965
- " 3.68442153577,\n",
966
- " 3.986467909825,\n",
967
- " 4.487157394745,\n",
968
- " 5.0395485112600005,\n",
969
- " 5.5946607662800005,\n",
970
- " 5.951129910435,\n",
971
- " 6.65046250622,\n",
972
- " 7.153873129645,\n",
973
- " 7.2763243623700005,\n",
974
- " 8.61512450683,\n",
975
- " 9.80698317202,\n",
976
- " 10.152567762155,\n",
977
- " 12.035595607615,\n",
978
- " 12.666899740775001,\n",
979
- " 13.662836433605,\n",
980
- " 14.027468993274999,\n",
981
- " 14.177131611050001,\n",
982
- " 14.81387802122,\n",
983
- " 14.881906483845002,\n",
984
- " 15.662873234780001,\n",
985
- " 15.910496838735,\n",
986
- " 16.08737084156,\n",
987
- " 16.367648107575,\n",
988
- " 16.628877404055,\n",
989
- " 17.075144118875,\n",
990
- " 17.93774502496,\n",
991
- " 18.03570601114,\n",
992
- " 18.634356482239998,\n",
993
- " 18.80850934656,\n",
994
- " 19.788119208359998,\n",
995
- " 20.174520876069998,\n",
996
- " 21.129640491325,\n",
997
- " 21.4969941895,\n",
998
- " 21.918770657775,\n",
999
- " 22.239865001365,\n",
1000
- " 22.462998358775,\n",
1001
- " 22.702458547215002,\n",
1002
- " 23.050764275855,\n",
1003
- " 23.23308055569,\n",
1004
- " 23.714722071075002,\n",
1005
- " 24.61541891623,\n",
1006
- " 25.178694586765,\n",
1007
- " 25.856258074510002,\n",
1008
- " 26.158304448565,\n",
1009
- " 26.479398792155,\n",
1010
- " 26.751512642655,\n",
1011
- " 27.094376094285,\n",
1012
- " 27.44540296143,\n",
1013
- " 27.878063983725,\n",
1014
- " 28.612771380075003,\n",
1015
- " 29.303940560345,\n",
1016
- " 29.546121887290003,\n",
1017
- " 29.777418660215,\n",
1018
- " 30.604644765735003,\n",
1019
- " 31.225064344875,\n",
1020
- " 32.193789652655,\n",
1021
- " 33.56796459768,\n",
1022
- " 33.97069309642,\n",
1023
- " 34.648256584165004,\n",
1024
- " 35.34214690294,\n",
1025
- " 36.14760390042,\n",
1026
- " 37.170751978300004,\n",
1027
- " 38.29730331937,\n",
1028
- " 39.21704813406,\n",
1029
- " 39.4565083225,\n",
1030
- " 39.611613217285,\n",
1031
- " 40.520473477955,\n",
1032
- " 42.817114376175,\n",
1033
- " 43.698763251795,\n",
1034
- " 44.65932514406,\n",
1035
- " 45.013073149709996,\n",
1036
- " 45.451176449015,\n",
1037
- " 45.56546426622501,\n",
1038
- " 46.16139359882,\n",
1039
- " 47.606318144975,\n",
1040
- " 48.06619055232,\n",
1041
- " 48.444428804515,\n",
1042
- " 48.977771951495,\n",
1043
- " 49.31791426462,\n",
1044
- " 49.837651719075,\n",
1045
- " 50.27031274137,\n",
1046
- " 50.87440548948,\n",
1047
- " 50.964203060145,\n",
1048
- " 51.268970572705,\n",
1049
- " 51.52203645367,\n",
1050
- " 53.07852767853,\n",
1051
- " 53.549284639895,\n",
1052
- " 54.07446437136,\n",
1053
- " 54.615970933854996,\n",
1054
- " 55.715310889875,\n",
1055
- " 56.466345117255,\n",
1056
- " 56.75206466028,\n",
1057
- " 58.011951788095,\n",
1058
- " 58.882716109695,\n",
1059
- " 59.742595877275,\n",
1060
- " 60.22423739266,\n",
1061
- " 61.85147821865,\n",
1062
- " 62.877347435035006,\n",
1063
- " 63.394363750985,\n",
1064
- " 63.987571945075004,\n",
1065
- " 64.38213702830001,\n",
1066
- " 65.571274554985,\n",
1067
- " 66.46380798462499,\n",
1068
- " 67.13592919536,\n",
1069
- " 68.07472197958501,\n",
1070
- " 68.40670087719501,\n",
1071
- " 69.08970664195,\n",
1072
- " 70.104691304315,\n",
1073
- " 71.23124264538501,\n",
1074
- " 71.70199960675,\n",
1075
- " 72.88297371792,\n",
1076
- " 73.11699162935,\n",
1077
- " 73.593190867725,\n",
1078
- " 76.11568626186,\n",
1079
- " 76.92114325934,\n",
1080
- " 77.664014071205,\n",
1081
- " 78.45586537616,\n",
1082
- " 81.000129878335,\n",
1083
- " 83.051868311105,\n",
1084
- " 84.559379042875,\n",
1085
- " 88.62475996934499,\n",
1086
- " 101.659013408295,\n",
1087
- " 103.759732334155,\n",
1088
- " 105.58561627101,\n",
1089
- " 110.20066717549001,\n",
1090
- " 112.26873243929,\n",
1091
- " 114.1245488997,\n",
1092
- " 116.195335302005,\n",
1093
- " 119.86070886824,\n",
1094
- " 120.57364715655001,\n",
1095
- " 122.5056554951,\n",
1096
- " 128.723456979025]],\n",
1097
- " 'mo-count': 181,\n",
1098
- " 'basis-count': 181,\n",
1099
- " 'multiplicity': 1,\n",
1100
- " 'molecular-mass': 156.13601999999992,\n",
1101
- " 'number-of-atoms': 19,\n",
1102
- " 'lowdin-partial-charges': [-0.165834,\n",
1103
- " -0.166217,\n",
1104
- " -0.00709,\n",
1105
- " -0.013209,\n",
1106
- " -0.079093,\n",
1107
- " -0.123254,\n",
1108
- " 0.188072,\n",
1109
- " -0.245033,\n",
1110
- " -0.385089,\n",
1111
- " -0.452317,\n",
1112
- " -0.452864,\n",
1113
- " 0.167807,\n",
1114
- " 0.16777,\n",
1115
- " 0.161347,\n",
1116
- " 0.170277,\n",
1117
- " 0.178644,\n",
1118
- " 0.358744,\n",
1119
- " 0.351181,\n",
1120
- " 0.346156],\n",
1121
- " 'mulliken-partial-charges': [-0.122757,\n",
1122
- " -0.14337,\n",
1123
- " 0.076825,\n",
1124
- " 0.022155,\n",
1125
- " 0.032184,\n",
1126
- " -0.158536,\n",
1127
- " 0.54755,\n",
1128
- " -0.448879,\n",
1129
- " -0.6056,\n",
1130
- " -0.631763,\n",
1131
- " -0.61658,\n",
1132
- " 0.149995,\n",
1133
- " 0.155257,\n",
1134
- " 0.162402,\n",
1135
- " 0.184383,\n",
1136
- " 0.175571,\n",
1137
- " 0.426152,\n",
1138
- " 0.402121,\n",
1139
- " 0.392891],\n",
1140
- " 'dipole-moment': 5.099240805823648,\n",
1141
- " 'pubchem-multiplicity': 1,\n",
1142
- " 'pubchem-obabel-canonical-smiles': 'OC1C=CC=C(C1O)C(=O)O',\n",
1143
- " 'pubchem-isomeric-smiles': 'C1=CC(C(C(=C1)C(=O)O)O)O',\n",
1144
- " 'pubchem-molecular-weight': 156.13602,\n",
1145
- " 'pubchem-molecular-formula': 'C7H8O4'}]"
1146
- ]
1147
- },
1148
- "execution_count": 15,
1149
- "metadata": {},
1150
- "output_type": "execute_result"
1151
- }
1152
- ],
1153
  "source": [
1154
  "list(hub_dataset.take(2))"
1155
  ]
 
17
  },
18
  {
19
  "cell_type": "code",
20
+ "execution_count": null,
21
  "metadata": {},
22
  "outputs": [],
23
  "source": [
 
35
  },
36
  {
37
  "cell_type": "code",
38
+ "execution_count": null,
39
  "metadata": {},
40
  "outputs": [],
41
  "source": [
 
63
  },
64
  {
65
  "cell_type": "code",
66
+ "execution_count": null,
67
  "metadata": {},
68
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  "source": [
70
  "# load the dataset\n",
71
  "hub_dataset = load_dataset(path=path,\n",
 
121
  },
122
  {
123
  "cell_type": "code",
124
+ "execution_count": null,
125
  "metadata": {},
126
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  "source": [
128
  "# print the column names\n",
129
  "hub_dataset.column_names"
 
141
  },
142
  {
143
  "cell_type": "code",
144
+ "execution_count": null,
145
  "metadata": {},
146
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  "source": [
148
  "list(hub_dataset.take(2))"
149
  ]