H4CK3R-5M4CK3R committed on
Commit
7bed2d8
·
1 Parent(s): 0b55168

added multiple URLs scraper

Browse files
Files changed (3) hide show
  1. app.py +61 -27
  2. sciencedirect.py +2 -2
  3. sciencedirect_admaths.py +1 -1
app.py CHANGED
@@ -12,7 +12,6 @@ import os
12
  import random
13
  import string
14
  import json
15
- import base64
16
  import crypto
17
 
18
  auth = [
@@ -33,6 +32,8 @@ For bug reports or improvements, contact [@H4CK3R_5M4CK3R](https://t.me/H4CK3R_5
33
  2. **Multiple Issues Scraping:**
34
  - Use curly braces `{}` in the URL to indicate where the volume (`v`) and issue (`i`) numbers should be inserted.
35
  - Define the range for volumes, not issues. Ensure you pass the volume range correctly.
 
 
36
 
37
  3. **Read this before using google sheet feature**
38
  - **IMPORTANT** First create a Google Drive folder and then give access to `[email protected]` and `[email protected]` — this is super important.
@@ -55,16 +56,17 @@ If you are copying this space make sure to contact the owner as mention above
55
  """
56
 
57
  exmpl = [
58
- ["https://www.ams.org/journals/jams/2024-37-01/home.html?active=allissues", 0, 0, "[email protected]", "asdfasdfasdfasdfasdf", "example1", False, True],
59
- ["https://www.degruyter.com/journal/key/fca/20/2/html", 0, 0, "[email protected]", "asdfasdfasdfasdfasdf", "example2", False, True],
60
- ["https://www.degruyter.com/journal/key/fca/{v}/{i}/html", 22, 23, "[email protected]", "asdfasdfasdfasdfasdf", "example3", False, True],
61
- ["https://www.aimspress.com/math/article/2024/8/archive-articles", 0, 0, "[email protected]", "asdfasdfasdfasdfasdf", "example4", False, True],
62
- ["https://www.aimspress.com/math/article/{v}/{i}/archive-articles", 2021, 2022, "[email protected]", "asdfasdfasdfasdfasdf", "example5", False, True],
63
- ["https://link.springer.com/journal/208/volumes-and-issues/388-3", 0, 0, "[email protected]", "asdfasdfasdfasdfasdf", "example6", False, True],
64
- ["https://link.springer.com/journal/208/volumes-and-issues/{v}-{i}", 388, 389, "[email protected]", "asdfasdfasdfasdfasdf", "example7", False, True],
65
- ["https://www.sciencedirect.com/journal/advances-in-applied-mathematics/vol/158/suppl/C", 0, 0, "[email protected]", "asdfasdfasdfasdfasdf", "example8", False, True],
66
- ["https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/38/issue/6", 0, 0, "[email protected]", "asdfasdfasdfasdfasdf", "example9", False, True],
67
- ["https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{v}/issue/{i}", 37, 38, "[email protected]", "asdfasdfasdfasdfasdf", "example10", False, True]
 
68
  ]
69
 
70
  stop_work = False
@@ -73,7 +75,7 @@ def generate_random_filename(length=8):
73
  random_string = ''.join(random.choices(string.ascii_letters + string.digits, k=length))
74
  return random_string
75
 
76
- def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, output: str, owner: bool, mail: str, folder_id: str, credit: dict, usegooglesheet: bool):
77
  if len(output.strip()) < 1:
78
  output = generate_random_filename()
79
  if os.path.exists(f"{output}.xlsx"):
@@ -84,8 +86,8 @@ def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, out
84
  shet = GoogleSheetAutomator(
85
  [
86
  "Name",
87
- "Email",
88
- "Address"
89
  ],
90
  folder_id,
91
  outputfile=output,
@@ -93,8 +95,8 @@ def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, out
93
  )
94
  sht = ExcelAutomator([
95
  "Name",
96
- "Email",
97
- "Address"
98
  ], output)
99
  filen = True
100
  if "{" in url:
@@ -266,6 +268,7 @@ def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, out
266
  yield {"final_output": sht.save_to_file(), "link" : ""}
267
 
268
  elif domain == "www.sciencedirect.com":
 
269
  for ur in links:
270
  isu = 1
271
  while True:
@@ -274,11 +277,19 @@ def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, out
274
  try:
275
  if filen:
276
  current_url = ur.format(i=isu)
277
- authors, _ = sciencedirect.run(current_url)
 
 
 
 
278
  isu += 1
279
  else:
280
  current_url = ur
281
- authors, _ = sciencedirect.run(current_url)
 
 
 
 
282
  except Exception as e:
283
  break
284
  yield {"current_url": current_url, "status": "fetching"}
@@ -303,6 +314,7 @@ def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, out
303
  yield {"final_output": sht.save_to_file(), "link" : ""}
304
 
305
  elif domain == "www.sciencedirect.com" and "acta-mathematica-scientia" in url:
 
306
  for ur in links:
307
  isu = 1
308
  while True:
@@ -311,11 +323,19 @@ def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, out
311
  try:
312
  if filen:
313
  current_url = ur.format(i=isu)
314
- authors, _ = sciencedirect_admaths.run(current_url)
 
 
 
 
315
  isu += 1
316
  else:
317
  current_url = ur
318
- authors, _ = sciencedirect_admaths.run(current_url)
 
 
 
 
319
  except Exception as e:
320
  break
321
  yield {"current_url": current_url, "status": "fetching"}
@@ -354,25 +374,35 @@ def handle_url(Link: str, Gmail: str, FolderId: str, Output: str, MakeOwner:bool
354
  authors = []
355
  details = []
356
  final_output = None
 
357
  link = None
358
  From_volume = 0
359
  To_Volume = 0
360
  urls = []
361
  if "\n" in Link:
362
  urls = Link.split("\n")
 
363
  else:
364
  urls.append(Link)
 
 
365
  for url in urls:
366
  if url.startswith("("):
367
- From_volume = url.split("(", 1)[1].split(":")[0]
368
- To_Volume = url.split("(", 1)[1].split(":")[1].split(")", 1)[0]
369
  url = url.split("-", 1)[1].strip()
370
- credit = crypto.decrypt(os.getenv("KEY"), os.getenv("GOOGLE_AUTH_CREDENTIALS"))
371
- credit = json.loads(credit)
 
 
 
372
 
373
- for _, result in enumerate(filterUrlandRun(url, From_volume, To_Volume, Reverse, Output, MakeOwner, Gmail, FolderId, credit=credit, usegooglesheet=UseGoogleSheet)):
374
  if "final_output" in result:
375
- final_output = result["final_output"]
 
 
 
376
  link = result["link"]
377
  else:
378
  if "author" in result:
@@ -383,11 +413,15 @@ def handle_url(Link: str, Gmail: str, FolderId: str, Output: str, MakeOwner:bool
383
  details.append(f"Scraping: {current_url}\n")
384
  authors = authors[-3:] if len(authors) > 3 else authors
385
  details = details[-3:] if len(details) > 3 else details
 
 
386
  yield "\n".join(authors), "\n".join(details), final_output if final_output else None, gr.Audio("notification.mp3", autoplay=True) if final_output else None, link
 
 
387
 
388
  interface = gr.Interface(
389
  fn=handle_url,
390
- inputs=["textbox", gr.Textbox(label="Access Gmail (Check Docs)"), gr.Textbox(label="Google Folder ID (Check Docs)"), gr.Textbox(label="Output File Name"), gr.Checkbox(True, label="Make Owner"), gr.Checkbox(True, label="Use Google Sheets"), "checkbox"],
391
  outputs=[gr.Markdown("LOGS", height="250px", elem_id="authorscroll"), gr.Markdown("", height="100px", elem_id="authorscroll"), "file", "audio", "textbox"],
392
  title="Web Scraper",
393
  description=description,
 
12
  import random
13
  import string
14
  import json
 
15
  import crypto
16
 
17
  auth = [
 
32
  2. **Multiple Issues Scraping:**
33
  - Use curly braces `{}` in the URL to indicate where the volume (`v`) and issue (`i`) numbers should be inserted.
34
  - Define the range for volumes, not issues. Ensure you pass the volume range correctly.
35
+ - To pass the range you must put `()` at the start of the URL, e.g. `(from:to)-link/{v}/{i}`; check out the examples below.
36
+ - You can now also pass multiple links, adding a range to each one in the same way as above.
37
 
38
  3. **Read this before using google sheet feature**
39
  - **IMPORTANT** First create a Google Drive folder and then give access to `[email protected]` and `[email protected]` — this is super important.
 
56
  """
57
 
58
  exmpl = [
59
+ ["https://www.ams.org/journals/jams/2024-37-01/home.html?active=allissues", "[email protected]", "asdfasdfasdfasdfasdf", "example1", True, True, False],
60
+ ["https://www.degruyter.com/journal/key/fca/20/2/html", "[email protected]", "asdfasdfasdfasdfasdf", "example2", True, True, False],
61
+ ["(22:23)-https://www.degruyter.com/journal/key/fca/{v}/{i}/html", "[email protected]", "asdfasdfasdfasdfasdf", "example3", True, True, False],
62
+ ["https://www.aimspress.com/math/article/2024/8/archive-articles", "[email protected]", "asdfasdfasdfasdfasdf", "example4", True, True, False],
63
+ ["(2021:2022)-https://www.aimspress.com/math/article/{v}/{i}/archive-articles", "[email protected]", "asdfasdfasdfasdfasdf", "example5", True, True, False],
64
+ ["https://link.springer.com/journal/208/volumes-and-issues/388-3", "[email protected]", "asdfasdfasdfasdfasdf", "example6", True, True, False],
65
+ ["(388:389)-https://link.springer.com/journal/208/volumes-and-issues/{v}-{i}", "[email protected]", "asdfasdfasdfasdfasdf", "example7", True, True, False],
66
+ ["https://www.sciencedirect.com/journal/advances-in-applied-mathematics/vol/158/suppl/C", "[email protected]", "asdfasdfasdfasdfasdf", "example8", True, True, False],
67
+ ["https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/38/issue/6", "[email protected]", "asdfasdfasdfasdfasdf", "example9", True, True, False],
68
+ ["(37:38)-https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{v}/issue/{i}", "[email protected]", "asdfasdfasdfasdfasdf", "example10", True, True, False],
69
+ ["(37:38)-https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{v}/issue/{i}\n(388:389)-https://link.springer.com/journal/208/volumes-and-issues/{v}-{i}\n(2021:2022)-https://www.aimspress.com/math/article/{v}/{i}/archive-articles\n(22:23)-https://www.degruyter.com/journal/key/fca/{v}/{i}/html", "[email protected]", "asdfasdfasdfasdfasdf", "example11", True, True, False]
70
  ]
71
 
72
  stop_work = False
 
75
  random_string = ''.join(random.choices(string.ascii_letters + string.digits, k=length))
76
  return random_string
77
 
78
+ def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, output: str, owner: bool, mail: str, folder_id: str, credit: dict, usegooglesheet: bool, multiUrls: bool):
79
  if len(output.strip()) < 1:
80
  output = generate_random_filename()
81
  if os.path.exists(f"{output}.xlsx"):
 
86
  shet = GoogleSheetAutomator(
87
  [
88
  "Name",
89
+ "Address",
90
+ "Email"
91
  ],
92
  folder_id,
93
  outputfile=output,
 
95
  )
96
  sht = ExcelAutomator([
97
  "Name",
98
+ "Address",
99
+ "Email"
100
  ], output)
101
  filen = True
102
  if "{" in url:
 
268
  yield {"final_output": sht.save_to_file(), "link" : ""}
269
 
270
  elif domain == "www.sciencedirect.com":
271
+ oldtitle = ""
272
  for ur in links:
273
  isu = 1
274
  while True:
 
277
  try:
278
  if filen:
279
  current_url = ur.format(i=isu)
280
+ authors, title = sciencedirect.run(current_url)
281
+ if title == oldtitle:
282
+ break
283
+ else:
284
+ oldtitle = title
285
  isu += 1
286
  else:
287
  current_url = ur
288
+ authors, title = sciencedirect.run(current_url)
289
+ if title == oldtitle:
290
+ break
291
+ else:
292
+ oldtitle = title
293
  except Exception as e:
294
  break
295
  yield {"current_url": current_url, "status": "fetching"}
 
314
  yield {"final_output": sht.save_to_file(), "link" : ""}
315
 
316
  elif domain == "www.sciencedirect.com" and "acta-mathematica-scientia" in url:
317
+ oldtitle = ""
318
  for ur in links:
319
  isu = 1
320
  while True:
 
323
  try:
324
  if filen:
325
  current_url = ur.format(i=isu)
326
+ authors, title = sciencedirect_admaths.run(current_url)
327
+ if title == oldtitle:
328
+ break
329
+ else:
330
+ oldtitle = title
331
  isu += 1
332
  else:
333
  current_url = ur
334
+ authors, title = sciencedirect_admaths.run(current_url)
335
+ if title == oldtitle:
336
+ break
337
+ else:
338
+ oldtitle = title
339
  except Exception as e:
340
  break
341
  yield {"current_url": current_url, "status": "fetching"}
 
374
  authors = []
375
  details = []
376
  final_output = None
377
+ multi_output = None
378
  link = None
379
  From_volume = 0
380
  To_Volume = 0
381
  urls = []
382
  if "\n" in Link:
383
  urls = Link.split("\n")
384
+ multiUrls = True
385
  else:
386
  urls.append(Link)
387
+ multiUrls = False
388
+ print(f"URLS : {urls}")
389
  for url in urls:
390
  if url.startswith("("):
391
+ From_volume = int(url.split("(", 1)[1].split(":")[0])
392
+ To_Volume = int(url.split("(", 1)[1].split(":")[1].split(")", 1)[0])
393
  url = url.split("-", 1)[1].strip()
394
+ try:
395
+ credit = crypto.decrypt(os.getenv("KEY"), os.getenv("GOOGLE_AUTH_CREDENTIALS"))
396
+ credit = json.loads(credit)
397
+ except:
398
+ pass
399
 
400
+ for _, result in enumerate(filterUrlandRun(url, From_volume, To_Volume, Reverse, Output, MakeOwner, Gmail, FolderId, credit=credit, usegooglesheet=UseGoogleSheet, multiUrls=multiUrls)):
401
  if "final_output" in result:
402
+ if multiUrls == False:
403
+ final_output = result["final_output"]
404
+ else:
405
+ multi_output = result["final_output"]
406
  link = result["link"]
407
  else:
408
  if "author" in result:
 
413
  details.append(f"Scraping: {current_url}\n")
414
  authors = authors[-3:] if len(authors) > 3 else authors
415
  details = details[-3:] if len(details) > 3 else details
416
+ if multiUrls:
417
+ final_output = None
418
  yield "\n".join(authors), "\n".join(details), final_output if final_output else None, gr.Audio("notification.mp3", autoplay=True) if final_output else None, link
419
+ if multiUrls == True:
420
+ yield "\n".join(authors), "\n".join(details), multi_output, gr.Audio("notification.mp3", autoplay=True), link
421
 
422
  interface = gr.Interface(
423
  fn=handle_url,
424
+ inputs=[gr.TextArea(label="Url / Url's", placeholder="Enter the url or multiple urls to scrap"), gr.Textbox(label="Access Gmail (Check Docs)"), gr.Textbox(label="Google Folder ID (Check Docs)"), gr.Textbox(label="Output File Name"), gr.Checkbox(True, label="Make Owner"), gr.Checkbox(True, label="Use Google Sheets"), "checkbox"],
425
  outputs=[gr.Markdown("LOGS", height="250px", elem_id="authorscroll"), gr.Markdown("", height="100px", elem_id="authorscroll"), "file", "audio", "textbox"],
426
  title="Web Scraper",
427
  description=description,
sciencedirect.py CHANGED
@@ -80,7 +80,7 @@ Sec-GPC: 1
80
  # url = f"https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{volume}/issue/{issue}"
81
 
82
  data = req.get(url, headers=headers)
83
-
84
  artical_links = []
85
  fullpage = BeautifulSoup(str(data.text), "lxml")
86
  if fullpage.title.string.strip() == last_artical_name:
@@ -169,7 +169,7 @@ Sec-GPC: 1
169
  except:
170
  address = "Not Found"
171
  if address == "Not Found":
172
- address = url
173
  output.append(
174
  {
175
  'Name' : aut['Name'],
 
80
  # url = f"https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{volume}/issue/{issue}"
81
 
82
  data = req.get(url, headers=headers)
83
+ print(f"URL : {data.url}")
84
  artical_links = []
85
  fullpage = BeautifulSoup(str(data.text), "lxml")
86
  if fullpage.title.string.strip() == last_artical_name:
 
169
  except:
170
  address = "Not Found"
171
  if address == "Not Found":
172
+ address = ""
173
  output.append(
174
  {
175
  'Name' : aut['Name'],
sciencedirect_admaths.py CHANGED
@@ -169,7 +169,7 @@ Sec-GPC: 1
169
  except:
170
  address = "Not Found"
171
  if address == "Not Found":
172
- address = url
173
  output.append(
174
  {
175
  'Name' : aut['Name'],
 
169
  except:
170
  address = "Not Found"
171
  if address == "Not Found":
172
+ address = ""
173
  output.append(
174
  {
175
  'Name' : aut['Name'],