vtiyyal1 committed on
Commit
bfa79fd
·
verified ·
1 Parent(s): f8e7b59

Upload get_articles.py

Browse files

CSV preparer with dates

Files changed (1) hide show
  1. get_articles.py +14 -3
get_articles.py CHANGED
@@ -3,7 +3,7 @@ import os
3
  import csv
4
  from sentence_transformers import SentenceTransformer, util
5
  import torch
6
-
7
  from get_keywords import get_keywords
8
  import os
9
 
@@ -74,7 +74,7 @@ def save_solr_articles(keywords: str, num_articles=15) -> str:
74
  os.makedirs(os.path.dirname(save_path))
75
 
76
  with open(save_path, 'w', newline='') as csvfile:
77
- fieldnames = ['title', 'uuid', 'content', 'url', 'domain']
78
  writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_NONNUMERIC)
79
  writer.writeheader()
80
 
@@ -110,9 +110,20 @@ def save_solr_articles(keywords: str, num_articles=15) -> str:
110
  else:
111
  domain = d['domain']
112
  print(domain)
 
 
 
 
 
 
 
 
 
 
113
 
114
  writer.writerow({'title': title_cleaned, 'uuid': d['uuid'], 'content': cleaned_content, 'url': d['url'],
115
- 'domain': domain})
 
116
  return save_path
117
 
118
 
 
3
  import csv
4
  from sentence_transformers import SentenceTransformer, util
5
  import torch
6
+ from datetime import datetime
7
  from get_keywords import get_keywords
8
  import os
9
 
 
74
  os.makedirs(os.path.dirname(save_path))
75
 
76
  with open(save_path, 'w', newline='') as csvfile:
77
+ fieldnames = ['title', 'uuid', 'content', 'url', 'domain','published_date']
78
  writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_NONNUMERIC)
79
  writer.writeheader()
80
 
 
110
  else:
111
  domain = d['domain']
112
  print(domain)
113
+ raw_date = d.get('year_month_day', "Unknown Date")
114
+
115
+ # Format the date from YYYY-MM-DD to MM/DD/YYYY if available
116
+ if raw_date != "Unknown Date":
117
+ try:
118
+ publication_date = datetime.strptime(raw_date, "%Y-%m-%d").strftime("%m/%d/%Y")
119
+ except ValueError:
120
+ publication_date = "Invalid Date"
121
+ else:
122
+ publication_date = raw_date
123
 
124
  writer.writerow({'title': title_cleaned, 'uuid': d['uuid'], 'content': cleaned_content, 'url': d['url'],
125
+ 'domain': domain, 'published_date': publication_date})
126
+ print(f"Article saved: {title_cleaned}, {d['uuid']}, {cleaned_content}, {d['url']}, {domain}, {publication_date}")
127
  return save_path
128
 
129