Upload get_articles.py
Browse files
csv preparer with dates
- get_articles.py +14 -3
get_articles.py
CHANGED
@@ -3,7 +3,7 @@ import os
|
|
3 |
import csv
|
4 |
from sentence_transformers import SentenceTransformer, util
|
5 |
import torch
|
6 |
-
|
7 |
from get_keywords import get_keywords
|
8 |
import os
|
9 |
|
@@ -74,7 +74,7 @@ def save_solr_articles(keywords: str, num_articles=15) -> str:
|
|
74 |
os.makedirs(os.path.dirname(save_path))
|
75 |
|
76 |
with open(save_path, 'w', newline='') as csvfile:
|
77 |
-
fieldnames = ['title', 'uuid', 'content', 'url', 'domain']
|
78 |
writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_NONNUMERIC)
|
79 |
writer.writeheader()
|
80 |
|
@@ -110,9 +110,20 @@ def save_solr_articles(keywords: str, num_articles=15) -> str:
|
|
110 |
else:
|
111 |
domain = d['domain']
|
112 |
print(domain)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
|
114 |
writer.writerow({'title': title_cleaned, 'uuid': d['uuid'], 'content': cleaned_content, 'url': d['url'],
|
115 |
-
'domain': domain})
|
|
|
116 |
return save_path
|
117 |
|
118 |
|
|
|
3 |
import csv
|
4 |
from sentence_transformers import SentenceTransformer, util
|
5 |
import torch
|
6 |
+
from datetime import datetime
|
7 |
from get_keywords import get_keywords
|
8 |
import os
|
9 |
|
|
|
74 |
os.makedirs(os.path.dirname(save_path))
|
75 |
|
76 |
with open(save_path, 'w', newline='') as csvfile:
|
77 |
+
fieldnames = ['title', 'uuid', 'content', 'url', 'domain','published_date']
|
78 |
writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_NONNUMERIC)
|
79 |
writer.writeheader()
|
80 |
|
|
|
110 |
else:
|
111 |
domain = d['domain']
|
112 |
print(domain)
|
113 |
+
raw_date = d.get('year_month_day', "Unknown Date")
|
114 |
+
|
115 |
+
# Format the date from YYYY-MM-DD to MM/DD/YYYY if available
|
116 |
+
if raw_date != "Unknown Date":
|
117 |
+
try:
|
118 |
+
publication_date = datetime.strptime(raw_date, "%Y-%m-%d").strftime("%m/%d/%Y")
|
119 |
+
except ValueError:
|
120 |
+
publication_date = "Invalid Date"
|
121 |
+
else:
|
122 |
+
publication_date = raw_date
|
123 |
|
124 |
writer.writerow({'title': title_cleaned, 'uuid': d['uuid'], 'content': cleaned_content, 'url': d['url'],
|
125 |
+
'domain': domain, 'published_date': publication_date})
|
126 |
+
print(f"Article saved: {title_cleaned}, {d['uuid']}, {cleaned_content}, {d['url']}, {domain}, {publication_date}")
|
127 |
return save_path
|
128 |
|
129 |
|