import csv
import gzip
import json
import os
import re
import sys

import langid

from pathlib import Path
from urllib.parse import urlparse

def is_valid_text_naive(text):
    '''Naive validity check: text must be longer than 3 characters after stripping.'''
    stripped_text = text.strip()
    return len(stripped_text) > 3


def remove_fragments_from_url(url):
    '''Drop the query string and fragment, keeping scheme, netloc and path.'''
    parsed_url = urlparse(url)
    return parsed_url.scheme + "://" + parsed_url.netloc + parsed_url.path

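# A minimal usage sketch (the URL below is hypothetical, not from the crawl):
# collapsing query strings and fragments lets tracking/anchor variants of the
# same article map to one canonical URL before deduplication.
def _demo_remove_fragments_from_url():
    url = "https://example.com.np/news/123?ref=home#comments"  # hypothetical URL
    print(remove_fragments_from_url(url))  # https://example.com.np/news/123
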
def load_env_var_in_google_colab():
    '''When running inside Google Colab, copy the required secrets from
    `google.colab.userdata` into environment variables.'''
    if 'google.colab' in sys.modules:
        try:
            from google.colab import userdata
            environ_variables = [
                "REDIS_HOST", "REDIS_PASSWORD", "REDIS_PORT",
                "mongo_password", "mongo_username"
            ]
            for env_var in environ_variables:
                try:
                    os.environ[env_var] = userdata.get(env_var)
                except Exception as ex:
                    print(ex)
        except Exception as ex:
            print(ex)

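# A minimal usage sketch, assuming the Colab notebook defines these secrets under
# the same names in its "Secrets" panel; outside Colab the call is a no-op.
def _demo_load_env_var_in_google_colab():
    load_env_var_in_google_colab()
    print(os.environ.get("REDIS_HOST"))  # None when not running in Colab
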
def merge_crawled_json_files():
    '''
    * Crawled data is saved in .json files.
    * There are multiple .json files.
    * This function combines them into a single list.
    '''
    data_files = [
        file for file in os.listdir()
        if file.endswith('.json') and file not in
        ['test.json', 'news_start_urls copy.json', 'news_start_urls.json']
    ]
    merged_data = []
    for data_file in data_files:
        # Empty files are deleted and skipped.
        if remove_file_if_empty(data_file):
            continue
        try:
            with open(data_file, 'r') as f:
                data = json.load(f)
        except json.JSONDecodeError:
            # The file is corrupt when scrapy is terminated while it is still
            # crawling: the file then ends with `{some_data},`. Appending `""]`
            # closes the JSON array so the file becomes valid JSON again.
            with open(data_file, 'a') as f:
                f.write("\"\"]")
            with open(data_file, 'r') as f:
                data = json.load(f)
        merged_data.extend(data)
    return merged_data

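# A small self-contained sketch of the repair trick used above: a crawl killed
# mid-write leaves a truncated JSON array such as `["a", "b",`; appending `""]`
# closes the array so json.load succeeds. The file name here is hypothetical.
def _demo_truncated_json_repair(path="_truncated_example.json"):
    with open(path, 'w') as f:
        f.write('["a", "b",')   # simulate a crawl killed mid-write
    with open(path, 'a') as f:
        f.write('""]')          # same repair as merge_crawled_json_files
    with open(path, 'r') as f:
        print(json.load(f))     # ['a', 'b', '']
    os.remove(path)
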
def merge_same_named_json_files(delete_merged=False):
    '''
    * Crawled data is saved in .json files.
    * There are multiple .json files.
    * This function combines them into a single `_merged_.json` file.
    * When `delete_merged` is True, each source file is deleted after it is merged.
    '''
    data_files = [
        file for file in os.listdir()
        if file.endswith('.json') and not file.endswith('_merged_.json')
        and file not in
        ['test.json', 'news_start_urls copy.json', 'news_start_urls.json']
    ]
    merged_file_name = '_merged_.json'
    for file in data_files:
        # Skip very large files (> ~10 MB) to keep the merged file manageable.
        if os.path.getsize(file) > 10000000:
            continue
        print(f'\n\n merged_file_name: {merged_file_name}\n')
        if remove_file_if_empty(file):
            continue
        try:
            with open(file, 'r') as f:
                data = json.load(f)
        except json.JSONDecodeError:
            # The file is corrupt when scrapy is terminated while it is still
            # crawling: it then ends with `{some_data},`. Appending `,{}]`
            # closes the JSON array so the file can be parsed again.
            with open(file, 'a') as f:
                f.write(",{}]")
            print(f'\n\n file: {file}\n')
            with open(file, 'r') as f:
                data = json.load(f)

        if os.path.exists(merged_file_name):
            try:
                with open(merged_file_name, 'r') as f:
                    old_data = json.load(f)
                if not old_data:
                    old_data = []
                old_data.extend(data)
            except Exception:
                print(f'\n\n-------------- Not merged_file_name: {merged_file_name}\n')
                continue
        else:
            old_data = data if data else []
        if old_data:
            with open(merged_file_name, 'w') as f:
                json.dump(old_data, f)
            if delete_merged:
                os.remove(file)

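# A minimal usage sketch, assuming it is run from the directory that holds the
# crawler's .json output: everything is folded into `_merged_.json`, and each
# already-merged source file is removed.
def _demo_merge_same_named_json_files():
    merge_same_named_json_files(delete_merged=True)
    print(os.path.getsize('_merged_.json'), 'bytes merged')
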
def compress_file(input_file_path="nepali_news_dataset.csv",
                  output_file_path="nepali_news_dataset.csv.gz"):
    '''Compress a file with gzip, e.g. the exported news dataset CSV.'''
    with open(input_file_path, 'rb') as csv_file:
        with gzip.open(output_file_path, 'wb') as compressed_file:
            compressed_file.writelines(csv_file)
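# A round-trip sketch (file names are the defaults above and are assumed to
# exist): gzip-compressed CSVs can be read back without decompressing to disk.
def _demo_compress_and_read_back():
    compress_file("nepali_news_dataset.csv", "nepali_news_dataset.csv.gz")
    with gzip.open("nepali_news_dataset.csv.gz", 'rt') as f:
        reader = csv.reader(f)
        print(next(reader))  # header row of the compressed dataset
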
def remove_file_if_empty(file_path):
    """Checks if a file is empty and removes it if it is.

    A file counts as empty when it has zero bytes or when it parses to an empty
    JSON value after the truncated-crawl repair used by the merge helpers above.

    Args:
        file_path (str): The path to the file to check.

    Returns:
        bool: True if the file was empty and removed, False otherwise.
    """
    if os.path.exists(file_path):
        if os.path.getsize(file_path) == 0:
            try:
                os.remove(file_path)
                print(f"Removed empty file: {file_path}")
                return True
            except OSError as e:
                print(f"Error removing file: {e}")
                return False
        else:
            try:
                # Large files (> ~10 MB) are assumed to hold data; do not parse them.
                if os.path.getsize(file_path) > 10000000:
                    return False
                with open(file_path, 'r') as f:
                    data = json.load(f)
            except Exception as ex:
                print(f'----- Exception: {ex} -----\n file_path: {file_path}\n')
                # Repair a file left truncated by an interrupted crawl, then re-read it.
                with open(file_path, 'a') as f:
                    f.write(",{}]")
                with open(file_path, 'r') as f:
                    data = json.load(f)
            if not data:
                try:
                    os.remove(file_path)
                    print(f"Removed empty file: {file_path}")
                    return True
                except OSError as e:
                    print(f"Error removing file: {e}")
                    return False
    print(f"File is not empty or does not exist: {file_path}")
    return False
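# A tiny self-contained sketch (the temporary file name is hypothetical): an
# empty file is deleted and the helper reports True; once removed, the second
# call falls through and reports False.
def _demo_remove_file_if_empty(path="_empty_example.json"):
    open(path, 'w').close()             # create a zero-byte file
    print(remove_file_if_empty(path))   # True, file removed
    print(remove_file_if_empty(path))   # False, file no longer exists
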
def is_nepali_language(text):
    '''Classify `text` with langid and report whether it looks like Nepali.

    langid frequently labels Devanagari Nepali as Hindi ('hi'), so both 'ne'
    and 'hi' are accepted here.
    '''
    lang, confidence = langid.classify(text)
    return lang in ('ne', 'hi'), confidence

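# A quick sketch with a short Nepali sentence: the helper returns an
# (accepted, confidence) pair, where the confidence is langid's raw score.
def _demo_is_nepali_language():
    ok, confidence = is_nepali_language("नेपाल दक्षिण एसियाको एक देश हो।")
    print(ok, confidence)
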
def is_google_drive_link(link):
    '''If `link` is a Google Drive file/folder URL, return (True, drive_id);
    otherwise return (False, link) unchanged.'''
    drive_prefixes = [
        'https://drive.google.com/file/d/',
        'https://drive.google.com/drive/folders/',
        'https://drive.google.com/open?id=',
        'https://drive.google.com/drive/u/0/folders/',
        'https://drive.google.com/drive/u/1/folders/',
        'https://drive.google.com/drive/u/2/folders/',
        'https://drive.google.com/drive/u/3/folders/',
        'https://drive.google.com/drive/u/0/file/d/',
        'https://drive.google.com/drive/u/1/file/d/',
        'https://drive.google.com/drive/u/2/file/d/',
        'https://drive.google.com/drive/u/3/file/d/',
        'https://drive.google.com/drive/mobile/folders/',
        'https://drive.google.com/folderview?id=',
    ]
    for prefix in drive_prefixes:
        if link.startswith(prefix):
            drive_id = link.split(prefix)[1].split('/')[0].split('?')[0]
            return True, drive_id
    return False, link

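# A small sketch with a made-up file id: the helper extracts the Drive id from
# any of the URL shapes listed above.
def _demo_is_google_drive_link():
    is_drive, drive_id = is_google_drive_link(
        "https://drive.google.com/file/d/1AbCdEfGh/view?usp=sharing")  # hypothetical id
    print(is_drive, drive_id)  # True 1AbCdEfGh
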
def is_same_domain(url1, url2):
    '''True when both URLs share the same network location (domain).'''
    return urlparse(url1).netloc == urlparse(url2).netloc


def is_np_domain(url):
    '''True when the URL's domain is under the `.np` country-code TLD.'''
    parsed_url = urlparse(url)
    return parsed_url.netloc.endswith('.np')


def is_special_domain_to_crawl(url):
    '''Non-`.np` sites that are whitelisted for crawling anyway.'''
    return url.startswith("https://www.bbc.com/nepali/") or url.startswith("https://np.usembassy.gov/ne/")


def is_document_or_media(url):
    '''True when the URL points to a social-media site or a document/media file.'''
    is_social_media, _ = is_social_media_link(url)
    is_document, _, _ = is_document_link(url)
    return is_social_media or is_document


def should_we_crawl_it(new_url, visited_urls_base=None):
    '''Decide whether the base domain of `new_url` should be queued for crawling:
    it must be a `.np` domain and the set of already-visited base URLs must still
    be small (fewer than 50 entries). Returns (base_url, decision).'''
    parsed_url = urlparse(new_url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
    if base_url.endswith('.np') and (visited_urls_base is None
                                     or len(visited_urls_base) < 50):
        return base_url, True
    return base_url, False

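# A short sketch tying the URL filters together, mirroring how a crawler might
# gate a newly discovered link (the URLs and visited set are hypothetical).
def _demo_should_we_crawl_it():
    visited = {"https://ekantipur.com"}  # hypothetical visited base URLs
    new_url = "https://example.gov.np/notice?page=2"
    if not is_document_or_media(new_url) and (
            is_np_domain(new_url) or is_special_domain_to_crawl(new_url)):
        base_url, crawl_it = should_we_crawl_it(new_url, visited)
        print(base_url, crawl_it)  # https://example.gov.np True
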
def is_social_media_link(link):
    '''If `link` points to a known social-media or community site, return
    (True, site_name); otherwise return (False, None).'''
    social_media_patterns = {
        'facebook': r"facebook\.com",
        'instagram': r"instagram\.com",
        'twitter': r"twitter\.com",
        'x': r"x\.com",
        'tiktok': r"tiktok\.com",
        'linkedin': r"linkedin\.com",
        'pinterest': r"pinterest\.com",
        'youtube': r"youtube\.com|youtu\.be",
        'reddit': r"reddit\.com",
        'snapchat': r"snapchat\.com",
        'quora': r"quora\.com",
        'tumblr': r"tumblr\.com",
        'flickr': r"flickr\.com",
        'medium': r"medium\.com",
        'vimeo': r"vimeo\.com",
        'vine': r"vine\.co",
        'periscope': r"periscope\.tv",
        'meetup': r"meetup\.com",
        'mix': r"mix\.com",
        'soundcloud': r"soundcloud\.com",
        'behance': r"behance\.net",
        'dribbble': r"dribbble\.com",
        'vk': r"vk\.com",
        'weibo': r"weibo\.com",
        'ok': r"ok\.ru",
        'deviantart': r"deviantart\.com",
        'slack': r"slack\.com",
        'telegram': r"t\.me|telegram\.me",
        'whatsapp': r"whatsapp\.com",
        'line': r"line\.me",
        'wechat': r"wechat\.com",
        'kik': r"kik\.me",
        'discord': r"discord\.gg",
        'skype': r"skype\.com",
        'twitch': r"twitch\.tv",
        'myspace': r"myspace\.com",
        'badoo': r"badoo\.com",
        'tagged': r"tagged\.com",
        'meetme': r"meetme\.com",
        'xing': r"xing\.com",
        'renren': r"renren\.com",
        'skyrock': r"skyrock\.com",
        'livejournal': r"livejournal\.com",
        'fotolog': r"fotolog\.com",
        'foursquare': r"foursquare\.com",
        'cyworld': r"cyworld\.com",
        'gaiaonline': r"gaiaonline\.com",
        'blackplanet': r"blackplanet\.com",
        'care2': r"care2\.com",
        'cafemom': r"cafemom\.com",
        'nextdoor': r"nextdoor\.com",
        'kiwibox': r"kiwibox\.com",
        'cellufun': r"cellufun\.com",
        'tinder': r"tinder\.com",
        'bumble': r"bumble\.com",
        'hinge': r"hinge\.co",
        'match': r"match\.com",
        'okcupid': r"okcupid\.com",
        'zoosk': r"zoosk\.com",
        'plentyoffish': r"pof\.com",
        'eharmony': r"eharmony\.com",
        'coffee_meets_bagel': r"coffeemeetsbagel\.com",
        'her': r"weareher\.com",
        'grindr': r"grindr\.com",
        'happn': r"happn\.com",
        'hily': r"hily\.com",
        'huggle': r"huggle\.com",
        'jdate': r"jdate\.com",
        'lovoo': r"lovoo\.com",
        'meetmindful': r"meetmindful\.com",
        'once': r"once\.com",
        'raya': r"raya\.app",
        'ship': r"getshipped\.com",
        'silversingles': r"silversingles\.com",
        'tastebuds': r"tastebuds\.fm",
        'the_league': r"theleague\.com",
        'tudder': r"tudder\.com",
        'twoo': r"twoo\.com",
    }

    for social_media, pattern in social_media_patterns.items():
        if (re.search(r'https://www\.' + pattern, link, flags=re.IGNORECASE)) or (
                re.search(r'https://www\.m\.' + pattern, link, flags=re.IGNORECASE)):
            return True, social_media

    return False, None

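# A quick sketch: the patterns above are only matched against links that start
# with `https://www.` (or `https://www.m.`), so bare-domain links fall through.
def _demo_is_social_media_link():
    print(is_social_media_link("https://www.facebook.com/somepage"))  # (True, 'facebook')
    print(is_social_media_link("https://facebook.com/somepage"))      # (False, None)
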
def is_document_link(link):
    '''Classify a link by its file extension.

    Returns a tuple `(matched, doc_type, other_flag)`: `matched` is True when
    the link ends with a known extension, `doc_type` names the category, and
    `other_flag` is True for the catch-all 'other' bucket as well as for links
    that match nothing.
    '''
    document_extensions = [
        '.pdf', '.odt', '.docx', '.doc', '.pst', '.ai', '.drw', '.dxf', '.eps',
        '.ps', '.svg', '.cdr', '.odg'
    ]

    presentation_extensions = ['.ppt', '.pptx', '.odp', '.pps']

    spreadsheet_extensions = ['.xls', '.xlsx', '.ods']

    archive_extensions = [
        '.zip', '.rar', '.tar', '.gz', '.7z', '.7zip', '.bz2', '.tar.gz', '.xz'
    ]

    video_extensions = [
        '.mp4', '.mkv', '.avi', '.mov', '.flv', '.wmv', '.mng', '.mpg', '.qt',
        '.rm', '.swf', '.m4v', '.webm'
    ]

    audio_extensions = [
        '.mp3', '.wav', '.ogg', '.wma', '.ra', '.aac', '.mid', '.au', '.aiff',
        '.3gp', '.asf', '.asx', '.m4a'
    ]

    image_extensions = [
        '.bmp', '.gif', '.jpg', '.jpeg', '.png', '.tif', '.tiff', '.ico',
        '.webp', '.mng', '.pct', '.psp'
    ]

    other_extensions = ['.css', '.exe', '.bin', '.rss', '.dmg', '.iso', '.apk']

    extensions = {
        'document': document_extensions,
        'presentation': presentation_extensions,
        'spreadsheet': spreadsheet_extensions,
        'archive': archive_extensions,
        'video': video_extensions,
        'audio': audio_extensions,
        'image': image_extensions,
        'other': other_extensions
    }

    for doc_type, extension_list in extensions.items():
        for extension in extension_list:
            if link.lower().endswith(extension):
                return True, doc_type, doc_type == 'other'
    return False, None, True

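# A final sketch showing how the extension classifier is typically consulted
# before fetching a link (the URLs are hypothetical).
def _demo_is_document_link():
    print(is_document_link("https://example.com.np/report-2023.pdf"))  # (True, 'document', False)
    print(is_document_link("https://example.com.np/photo.jpg"))        # (True, 'image', False)
    print(is_document_link("https://example.com.np/news/123"))         # (False, None, True)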