DrishtiSharma committed on
Commit
3491a63
·
verified ·
1 Parent(s): b6bff6f

Create preprocess_data.py

Browse files
Files changed (1) hide show
  1. patentwiz/preprocess_data.py +272 -0
patentwiz/preprocess_data.py ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ import zipfile
4
+ import xml.etree.ElementTree as ET
5
+ from datetime import datetime, timedelta
6
+ import pickle
7
+
8
+
9
def download_weekly_patents(year, month, day, logging):
    """
    Download and extract a weekly patent file from the USPTO website.

    Parameters:
        year (int): The year of the patent.
        month (int): The month of the patent.
        day (int): The day of the patent.
        logging (bool): The boolean to print logs.

    Returns:
        bool: True if the download is successful (or the file already
        exists locally), False otherwise.
    """

    # Check if the "data" folder exists and create one if it doesn't
    data_folder = os.path.join(os.getcwd(), "data")
    if not os.path.exists(data_folder):
        if logging:
            print("Data folder not found. Creating a new 'data' folder.")
        os.makedirs(data_folder)

    # Extracted archives land in a directory named like "ipa240104";
    # its presence means this week was already processed.
    directory = os.path.join(
        os.getcwd(), "data", "ipa" + str(year)[2:] + f"{month:02d}" + f"{day:02d}"
    )

    if os.path.exists(directory):
        print(f"File {directory} already exists. Skipping download.")
        return True

    if logging:
        print("Building the URL...")
    base_url = "https://bulkdata.uspto.gov/data/patent/application/redbook/fulltext"
    file_url = (
        base_url
        + "/"
        + str(year)
        + "/ipa"
        + str(year)[2:]
        + f"{month:02d}"
        + f"{day:02d}"
        + ".zip"
    )

    if logging:
        print(f"URL constructed: {file_url}")
    # Stream the download, set a (connect, read) timeout so a stalled
    # connection cannot hang the caller forever, and close the response
    # on every path (the original leaked the connection).
    with requests.get(file_url, stream=True, timeout=(10, 120)) as r:
        if logging:
            print("Requesting the file...")
        if r.status_code != 200:
            print(
                "File could not be downloaded. Please make sure the year, month, and day are correct."
            )
            return False

        if logging:
            print("File retrieved successfully. Starting download...")
        local_path = os.path.join(os.getcwd(), "data", "patents.zip")

        with open(local_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive chunks
                    f.write(chunk)

    if logging:
        print("File downloaded successfully. Starting extraction...")
    with zipfile.ZipFile(local_path, "r") as zip_ref:
        zip_ref.extractall(os.path.join(os.getcwd(), "data"))

    if logging:
        print("File extracted successfully.")
    # Deleting the ZIP file after extraction
    os.remove(local_path)
    if logging:
        print(f"ZIP file {local_path} deleted after extraction.")

    return True
83
+
84
def filter_rf_patents(patents, keywords=None):
    """
    Filter patent texts down to those mentioning RF-related keywords.

    Matching is case-insensitive on whole words/phrases. The original
    implementation used a bare substring test, so a short keyword such as
    "RF" matched inside unrelated words like "performance", "surface" or
    "interface"; word boundaries eliminate those false positives. Stray
    whitespace around keywords (e.g. " Microwave Devices" in the default
    list) is stripped before matching.

    Parameters:
        patents (list): A list of patent texts (strings).
        keywords (list): Keywords to filter patents; defaults to a built-in
            list of RF/antenna/wireless terms.

    Returns:
        list: The subset of `patents` containing at least one keyword.
    """
    import re  # local import keeps the module's import block untouched

    if keywords is None:
        keywords = [
            "RF", "Radio Frequency", "Wireless Communication", "Microwave Antenna", " Microwave Devices", "Antenna",
            "Antenna Array", "Beamforming", "Analog beamforming", "Digital Beamforming", "Radiation Pattern",
            "Antenna Gain", "Antenna Bandwidth", "5G", "6G", "IoT", "RF Spectrum", "RF Filter", "Power Amplifier",
            "High Frequency Oscillator", "RF Mixer", "RF Coupler", "Standing Wave Ratio", "SWR",
            "Voltage Standing Wave Ratio", "VSWR", "Transceiver", "Patch Antenna", "Parabolic Antenna", "Dipole Antenna",
            "Directional Coupler", "Phased Array Antenna", "Wireless Charging", "Inductive charging", "Transparent Antenna",
            "Scattering Parameters", "Smith Chart"
        ]
    # One compiled alternation: \b anchors give whole-word matching,
    # re.escape protects any punctuation in caller-supplied keywords.
    pattern = re.compile(
        r"\b(?:" + "|".join(re.escape(k.strip()) for k in keywords) + r")\b",
        re.IGNORECASE,
    )
    rf_patents = [patent for patent in patents if pattern.search(patent)]
    return rf_patents
107
+
108
def extract_patents(year, month, day, logging):
    """
    Split a weekly USPTO bulk XML file into individual patent txt files.

    Reads the downloaded "ipaYYMMDD.xml" file, splits it into per-patent XML
    documents, keeps only section-'C' applications whose descriptions pass
    the RF keyword filter, and writes each surviving description to its own
    txt file under "data/ipaYYMMDD". The list of written file names is
    pickled alongside them so a re-run can skip the work.

    Parameters:
        year (int): The year of the patent file to process.
        month (int): The month of the patent file to process.
        day (int): The day of the patent file to process.
        logging (bool): The boolean to print logs.

    Returns:
        list: Names ("<file_id>.txt") of the patent files that were saved.
    """

    directory = os.path.join(
        os.getcwd(), "data", "ipa" + str(year)[2:] + f"{month:02d}" + f"{day:02d}"
    )
    saved_patent_names_path = os.path.join(directory, 'saved_patent_names.pkl')

    if os.path.exists(directory):
        print(f"File {directory} already exists. Skipping extract.")

        # Load saved_patent_names from file
        with open(saved_patent_names_path, 'rb') as f:
            saved_patent_names = pickle.load(f)

        return saved_patent_names
    else:
        os.mkdir(directory)

    if logging:
        print("Locating the patent file...")
    file_path = os.path.join(
        os.getcwd(),
        "data",
        "ipa" + str(year)[2:] + f"{month:02d}" + f"{day:02d}" + ".xml",
    )

    if logging:
        print("Reading the patent file...")
    with open(file_path, "r") as f:
        contents = f.read()

    if logging:
        print("Splitting the XML file into individual XMLs...")
    # The bulk file is many XML documents concatenated; each one starts
    # with its own XML declaration, so split on that marker.
    temp = contents.split('<?xml version="1.0" encoding="UTF-8"?>')
    allXmls = [
        '<?xml version="1.0" encoding="UTF-8"?>' + s.replace("\n", "") for s in temp
    ]

    # saving only the XMLs that contain a patent
    patents = []
    for xml_string in allXmls:
        start_index = xml_string.find("<!DOCTYPE")
        end_index = xml_string.find(">", start_index)

        if start_index != -1 and end_index != -1:
            doctype_declaration = xml_string[start_index : end_index + 1]
            # Extract only the name of the DOCTYPE
            doctype_name = doctype_declaration.split()[1]
            if doctype_name == "us-patent-application":
                patents.append(xml_string)

    if logging:
        print(f"Total patents found: {len(patents)}")
        print("Writing individual patents to separate txt files...")

    saved_patent_names = []
    for patent in patents:
        # Bound before parsing so the except handler below cannot raise a
        # NameError when ET.fromstring fails on the very first statement
        # (the original referenced patent_id there while it was unbound).
        patent_id = None
        try:
            root = ET.fromstring(patent)

            patent_id = root.find(
                ".//publication-reference/document-id/doc-number"
            ).text
            file_id = root.attrib["file"]

            ipcr_classifications = root.findall(".//classification-ipcr")

            # Guard against malformed <classification-ipcr> entries lacking
            # a <section> child (find() returns None in that case).
            if any(
                ipcr.find("./section") is not None
                and ipcr.find("./section").text == "C"
                for ipcr in ipcr_classifications
            ):
                description_element = root.find(".//description")
                description_text = get_full_text(description_element)

                # Filter RF-relevant content
                filtered_description = filter_rf_patents(description_text)
                if filtered_description:
                    description_string = " ".join(filtered_description)
                    output_file_path = os.path.join(directory, f"{file_id}.txt")
                    with open(output_file_path, "w") as f:
                        f.write(description_string)
                    saved_patent_names.append(f"{file_id}.txt")

            elif logging:
                print(
                    f"Patent {patent_id} does not belong to section 'C'. Skipping this patent."
                )
        except ET.ParseError as e:
            print(f"Error while parsing patent: {patent_id}. Skipping this patent.")
            print(f"Error message: {e}")

    # Save saved_patent_names to file
    with open(saved_patent_names_path, 'wb') as f:
        pickle.dump(saved_patent_names, f)

    if logging:
        print("Patent extraction complete.")

    # Deleting the main XML file after extraction
    os.remove(file_path)

    if logging:
        print(f"Main XML file {file_path} deleted after extraction.")
    return saved_patent_names
224
+
225
+
226
def get_full_text(element):
    """
    Recursively collect every text fragment from an XML element tree.

    Parameters:
        element (xml.etree.ElementTree.Element): The root XML element to start parsing.

    Returns:
        list: Non-empty, stripped text strings in document order, covering
        the element's own text, its descendants' text, and tail text.
    """

    fragments = []
    own_text = element.text
    if own_text is not None and own_text.strip():
        fragments.append(own_text.strip())
    for sub_element in element:
        fragments += get_full_text(sub_element)
        tail_text = sub_element.tail
        if tail_text is not None and tail_text.strip():
            fragments.append(tail_text.strip())
    return fragments
243
+
244
+
245
def parse_and_save_patents(start_date, end_date, logging=False):
    """
    Download, extract, and save USPTO patents over a range of dates.

    Walks from start_date to end_date in 7-day steps (USPTO publishes one
    bulk file per week), downloading each week's archive, extracting the
    individual patents, and saving them as separate text files.

    Parameters:
        start_date (datetime): The start date of the range.
        end_date (datetime): The end date of the range (inclusive).
        logging (bool): The boolean to print logs.

    Returns:
        list: A list of strings containing the names of saved patent text files.
    """
    all_saved_patent_names = []

    current_date = start_date
    while current_date <= end_date:
        y, m, d = current_date.year, current_date.month, current_date.day
        if logging:
            print(f"Processing patents for {current_date.strftime('%Y-%m-%d')}...")

        # Only attempt extraction when this week's archive is available.
        if download_weekly_patents(y, m, d, logging):
            all_saved_patent_names.extend(extract_patents(y, m, d, logging))

        current_date += timedelta(days=7)  # USPTO weekly files are organized by week

    return all_saved_patent_names