Spaces:

mdredze1
/

tobacco-watcher-chat-with-citations

Sleeping

vtiyyal1 committed on Nov 22, 2024

Commit

bfa79fd

verified ·

1 Parent(s): f8e7b59

Upload get_articles.py

CSV preparer with dates

Files changed (1) hide show

get_articles.py CHANGED Viewed

@@ -3,7 +3,7 @@ import os
 import csv
 from sentence_transformers import SentenceTransformer, util
 import torch
 from get_keywords import get_keywords
 import os
@@ -74,7 +74,7 @@ def save_solr_articles(keywords: str, num_articles=15) -> str:
         os.makedirs(os.path.dirname(save_path))
     with open(save_path, 'w', newline='') as csvfile:
-        fieldnames = ['title', 'uuid', 'content', 'url', 'domain']
         writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_NONNUMERIC)
         writer.writeheader()
@@ -110,9 +110,20 @@ def save_solr_articles(keywords: str, num_articles=15) -> str:
             else:
                 domain = d['domain']
             print(domain)
             writer.writerow({'title': title_cleaned, 'uuid': d['uuid'], 'content': cleaned_content, 'url': d['url'],
-                             'domain': domain})
     return save_path

 import csv
 from sentence_transformers import SentenceTransformer, util
 import torch
+from datetime import datetime
 from get_keywords import get_keywords
 import os
         os.makedirs(os.path.dirname(save_path))
     with open(save_path, 'w', newline='') as csvfile:
+        fieldnames = ['title', 'uuid', 'content', 'url', 'domain','published_date']
         writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_NONNUMERIC)
         writer.writeheader()
             else:
                 domain = d['domain']
             print(domain)
+            raw_date = d.get('year_month_day', "Unknown Date")
+            # Format the date from YYYY-MM-DD to MM/DD/YYYY if available
+            if raw_date != "Unknown Date":
+                try:
+                    publication_date = datetime.strptime(raw_date, "%Y-%m-%d").strftime("%m/%d/%Y")
+                except ValueError:
+                    publication_date = "Invalid Date"
+            else:
+                publication_date = raw_date
             writer.writerow({'title': title_cleaned, 'uuid': d['uuid'], 'content': cleaned_content, 'url': d['url'],
+                             'domain': domain, 'published_date': publication_date})
+            print(f"Article saved: {title_cleaned}, {d['uuid']}, {cleaned_content}, {d['url']}, {domain}, {publication_date}")
     return save_path