XThomasBU
committed on
Commit
·
5cd7fa4
1
Parent(s):
34aaae9
remove hardcoded link
Browse files
- README.md +1 -1
- code/modules/dataloader/data_loader.py +13 -3
README.md
CHANGED
@@ -37,7 +37,7 @@ Please visit [setup](https://dl4ds.github.io/dl4ds_tutor/guide/setup/) for more
|
|
37 |
3. **To test Data Loading (Optional)**
|
38 |
```bash
|
39 |
cd code
|
40 |
-
python -m modules.dataloader.data_loader
|
41 |
```
|
42 |
|
43 |
4. **Create the Vector Database**
|
|
|
37 |
3. **To test Data Loading (Optional)**
|
38 |
```bash
|
39 |
cd code
|
40 |
+
python -m modules.dataloader.data_loader --links "your_pdf_link"
|
41 |
```
|
42 |
|
43 |
4. **Create the Vector Database**
|
code/modules/dataloader/data_loader.py
CHANGED
@@ -417,6 +417,18 @@ class DataLoader:
|
|
417 |
|
418 |
if __name__ == "__main__":
|
419 |
import yaml
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
420 |
|
421 |
logger = logging.getLogger(__name__)
|
422 |
logger.setLevel(logging.INFO)
|
@@ -445,9 +457,7 @@ if __name__ == "__main__":
|
|
445 |
documents,
|
446 |
document_metadata,
|
447 |
) = data_loader.get_chunks(
|
448 |
-
|
449 |
-
"https://dl4ds.github.io/fa2024/static_files/discussion_slides/00_discussion.pdf"
|
450 |
-
],
|
451 |
[],
|
452 |
)
|
453 |
|
|
|
417 |
|
418 |
if __name__ == "__main__":
|
419 |
import yaml
|
420 |
+
import argparse
|
421 |
+
|
422 |
+
parser = argparse.ArgumentParser(description="Process some links.")
|
423 |
+
parser.add_argument(
|
424 |
+
'--links',
|
425 |
+
nargs='+',
|
426 |
+
required=True,
|
427 |
+
help="List of links to process."
|
428 |
+
)
|
429 |
+
|
430 |
+
args = parser.parse_args()
|
431 |
+
links_to_process = args.links
|
432 |
|
433 |
logger = logging.getLogger(__name__)
|
434 |
logger.setLevel(logging.INFO)
|
|
|
457 |
documents,
|
458 |
document_metadata,
|
459 |
) = data_loader.get_chunks(
|
460 |
+
links_to_process,
|
|
|
|
|
461 |
[],
|
462 |
)
|
463 |
|