import datetime
import re

# By is needed for the Selenium selector lookups below.
from selenium.webdriver.common.by import By

# Helper functions convert_selenium_to_scrapy, process_uploaddate and
# convert_numbers are assumed to be defined elsewhere in this project.

# Maps date tokens to zero-padded month numbers. Numeric day/month strings
# ('1'-'31' and '01'-'09') map to themselves so that day tokens can be looked
# up through the same dict; month names in several languages (English, French,
# Dutch, Hindi, Marathi, Gujarati, Bengali, Malayalam) and common misspellings
# map to their month number.
months = {
    **{str(i): str(i) for i in range(1, 32)},
    **{f'{i:02d}': f'{i:02d}' for i in range(1, 10)},
    'jan': '01', 'januari': '01', 'january': '01', 'janvier': '01', 'jany': '01',
    'જાન્યુઆરી': '01', 'जनवरी': '01', 'जानेवारी': '01', 'জানুয়ারি': '01',
    'feb': '02', 'februari': '02', 'february': '02', 'février': '02',
    'ફેબ્રુઆરી': '02', 'फरवरी': '02', 'फेब्रुवारी': '02', 'ഫെബ്രുവരി': '02', 'ফেব্রুয়ারি': '02',
    'mar': '03', 'march': '03', 'mars': '03', 'maart': '03',
    'માર્ચ': '03', 'मार्च': '03', 'মার্চ': '03',
    'apr': '04', 'april': '04', 'avril': '04',
    'એપ્રિલ': '04', 'अप्रैल': '04', 'एप्रिल': '04', 'এপ্রিল': '04',
    'may': '05', 'mei': '05', 'mai': '05',
    'મે': '05', 'मई': '05', 'मे': '05', 'মে': '05', 'മെയ്': '05',
    'jun': '06', 'june': '06', 'juin': '06', 'juni': '06',
    'જૂન': '06', 'जून': '06', 'জুন': '06',
    'jul': '07', 'july': '07', 'juillet': '07', 'juli': '07', 'jyuly': '07',
    'જુલાઈ': '07', 'जुलाई': '07', 'जुलै': '07', 'জুলাই': '07', 'ജൂലൈ': '07',
    'aug': '08', 'august': '08', 'août': '08', 'augustus': '08',
    'ઓગસ્ટ': '08', 'अगस्त': '08', 'ऑगस्ट': '08', 'আগস্ট': '08',
    'sep': '09', 'sept': '09', 'september': '09', 'septembre': '09',
    'સપ્ટેમ્બર': '09', 'सितम्बर': '09', 'सप्टेंबर': '09', 'সেপ্টেম্বর': '09',
    'oct': '10', 'october': '10', 'oktober': '10', 'octobre': '10',
    'ઓક્ટોબર': '10', 'अक्टूबर': '10', 'ऑक्टोबर': '10', 'অক্টোবর': '10',
    'nov': '11', 'november': '11', 'novembre': '11',
    'નવેમ્બર': '11', 'नवम्बर': '11', 'नवंबर': '11', 'नोव्हेंबर': '11', 'নভেম্বর': '11',
    'dec': '12', 'december': '12', 'décembre': '12',
    'ડિસેમ્બર': '12', 'दिसंबर': '12', 'डिसेंबर': '12', 'ডিসেম্বর': '12',
}
# Matches a month-name token in any supported script: Devanagari, Gujarati,
# Telugu, Malayalam, Tamil, Gurmukhi, Oriya, Bengali, Arabic, or Latin
# (including the accented letters used by French month names).
months_pattern = r'[\u0900-\u097F]+|[\u0A80-\u0AFF]+|[\u0C00-\u0C7F]+|[\u0D00-\u0D7F]+|[\u0B80-\u0BFF]+|[\u0A00-\u0A7F]+|[\u0B00-\u0B7F]+|[\u0980-\u09FF]+|[\u0600-\u06FF]+|[A-Za-zéû]+'
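# Illustrative use: months_pattern extracts the month-name token from mixed
# text, e.g. re.findall(months_pattern, '14 जुलाई 2023') == ['जुलाई'].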
# Characters that can legitimately appear in a URL (letters, digits and
# common URL punctuation).
url_pattern = r'[\w\:\/\.\?\[\]\=\&\;\-\%]+'
# Time anchors computed at import time.
now = datetime.datetime.now()
this_hour = datetime.datetime(now.year, now.month, now.day, now.hour)
this_day = datetime.datetime(now.year, now.month, now.day)
this_month = datetime.datetime(now.year, now.month, 1)                 # first of this month
this_month = this_month - datetime.timedelta(days=1)                   # last day of previous month
this_month = datetime.datetime(this_month.year, this_month.month, 1)   # first of previous month
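# For example, when the module is loaded on 2024-03-15 10:42, this_hour is
# 2024-03-15 10:00, this_day is 2024-03-15 00:00 and this_month is
# 2024-02-01 00:00.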

def convert_css_selector(text):
    ''' Converts a Selenium selector for use with scrapy / bs4

    Parameters
    --------
    text: str
        Selenium selector string

    Returns
    --------
    list or str
        Selector tag with class/id and its value, e.g.
        ['div', 'class', 'entry-content']. Nested XPath selectors are
        returned unchanged.
    '''
    if '//' in text:
        # Nested XPath (more than one path step): pass through unchanged.
        if '/' in text.replace('//', ''):
            return text
        return list(re.findall(r'\/\/(\w+)\[\@(\w+)\*?\=["\']([a-zA-Z0-9\:\-\_]+)["\']\]', text)[0])
    if '[' in text:
        return list(re.findall(r'(\w+)\[(\w+)\*?\=["\']([a-zA-Z0-9\:\-\_]+)["\']\]', text)[0])
    return [text, '', '']
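# Illustrative use:
# convert_css_selector('//div[@class="entry-content"]')  -> ['div', 'class', 'entry-content']
# convert_css_selector('div[class="entry-content"]')     -> ['div', 'class', 'entry-content']
# convert_css_selector('//div[@id="main"]/p')            -> '//div[@id="main"]/p' (passed through)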

def get_text_from_div(content):
    ''' Extracts plain text from HTML tags

    Parameters
    --------
    content: str
        Input HTML string

    Returns
    --------
    str
        Cleaned string containing only the plain text
    '''
    content = content.replace('<br>', '\n')
    temp_content = ''
    flag = 1  # 1 while outside a tag, 0 while inside one
    for i in range(len(content)):
        if content[i] == '<':
            # Keep sentence boundaries readable when a tag follows a full stop.
            if len(temp_content) and temp_content[-1] == '.':
                temp_content = temp_content + ' '
            flag = 0
            temp_content = temp_content + ' '
            continue
        # A closing '>' re-enables text collection, except after a <script> or
        # <style> opener, whose contents are skipped entirely.
        if content[i] == '>' and not content[:i].split('<')[-1].startswith('script') and not content[:i].split('<')[-1].startswith('style'):
            flag = 1
            continue
        if flag:
            temp_content = temp_content + content[i]
    # Collapse the runs of spaces left behind by the removed tags.
    while '  ' in temp_content:
        temp_content = temp_content.replace('  ', ' ')
    return temp_content
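# Illustrative use (tags become spaces, which are then collapsed):
# get_text_from_div('<p>Hello <b>world</b>.</p>')  -> ' Hello world . '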

def get_website(link):
    ''' Gets the website name from a link
    For e.g. https://www.theaustralian.com.au/news/ becomes theaustralian.com.au
    and https://www.youtube.com/watch?v=..... becomes youtube.com

    Parameters
    --------
    link: str
        URL text

    Returns
    --------
    str
        website name
    '''
    pattern = r'[a-z0-9\-]*\.*[a-z0-9\-]*\.*[a-z0-9\-]*\.*[a-z0-9\-]+\.[a-z]+'
    website = re.findall(pattern, link)[0].replace('www.', '').replace('www1.', '').split('.')
    # Keep three labels for second-level domains such as .co.uk or .com.au,
    # otherwise two.
    if website[-2] in ['ac', 'blogspot', 'co', 'com', 'go', 'indiatimes', 'nic', 'net', 'org', 'gov']:
        website = '.'.join(website[-3:])
    else:
        website = '.'.join(website[-2:])
    return website
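# Illustrative use:
# get_website('https://www.theaustralian.com.au/news/')  -> 'theaustralian.com.au'
# get_website('https://news.bbc.co.uk/article')          -> 'bbc.co.uk'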

def to_datetime(uploaddate):
    ''' Converts a date string to datetime.datetime
    Supported formats: YYYY-MM-DD, DD-MM-YY, YYYY/MM/DD and DD/MM/YYYY.

    Parameters
    --------
    uploaddate: str
        Date string

    Returns
    --------
    datetime.datetime or None
    '''
    # YYYY-MM-DD
    pattern = r'(\d{4})-(\d{2})-(\d{2})'
    uploaddate2 = re.findall(pattern, uploaddate)
    if len(uploaddate2):
        uploaddate2 = uploaddate2[0]
        return datetime.datetime(int(uploaddate2[0]), int(uploaddate2[1]), int(uploaddate2[2]))
    # DD-MM-YY (two-digit years are assumed to be 20xx)
    pattern = r'(\d{2})-(\d{2})-(\d{2})'
    try:
        uploaddate2 = re.findall(pattern, uploaddate)[0]
        return datetime.datetime(int('20' + uploaddate2[2]), int(uploaddate2[1]), int(uploaddate2[0]))
    except IndexError:
        pass
    # YYYY/MM/DD
    pattern = r'(\d{4})\/(\d{1,2})\/(\d{1,2})'
    try:
        uploaddate2 = re.findall(pattern, uploaddate)[0]
        return datetime.datetime(int(uploaddate2[0]), int(uploaddate2[1]), int(uploaddate2[2]))
    except IndexError:
        pass
    # DD/MM/YYYY
    pattern = r'(\d{1,2})\/(\d{1,2})\/(\d{4})'
    try:
        uploaddate2 = re.findall(pattern, uploaddate)[0]
        return datetime.datetime(int(uploaddate2[2]), int(uploaddate2[1]), int(uploaddate2[0]))
    except IndexError:
        pass
    return None
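# Illustrative use:
# to_datetime('2023-07-14')   -> datetime.datetime(2023, 7, 14, 0, 0)
# to_datetime('14/07/2023')   -> datetime.datetime(2023, 7, 14, 0, 0)
# to_datetime('no date here') -> None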

def get_single_field(mode, obj, f, mongo_dict, field):
    ''' Gets a field using its field element information

    Parameters
    --------
    mode: str
        Mode - Scrapy or Selenium
    obj: scrapy.http.response.html.HtmlResponse / selenium_driver.SeleniumDriver
        response of webpage
    f: dict
        website attributes element
    mongo_dict: dict
        news record
    field: str
        field to be extracted
    '''
    index = f['index'] if 'index' in f else 0
    if mode == 'Scrapy':
        f['text'] = convert_selenium_to_scrapy(f['text'])
    if f['text'] == 'url':
        # The field comes from the page URL itself.
        match mode:
            case 'Scrapy':
                mongo_dict[field] = process_field(mode, obj.url, field, f)
            case 'Selenium':
                mongo_dict[field] = process_field(mode, obj.current_url, field, f)
    elif f['text'].startswith('//'):
        # Primary selector is XPath.
        if 'next' in f:
            # A nested selector is applied to the element found first.
            if mode == 'Scrapy':
                f['next']['text'] = convert_selenium_to_scrapy(f['next']['text'])
            if f['next']['text'].startswith('//'):
                match mode:
                    case 'Scrapy':
                        mongo_dict[field] = process_field(mode, obj.xpath(f['text'])[index].xpath(f['next']['text'])[0], field, f)
                    case 'Selenium':
                        mongo_dict[field] = process_field(mode, obj.find_elements(By.XPATH, f['text'])[index].find_element(By.XPATH, f['next']['text']), field, f)
            else:
                match mode:
                    case 'Scrapy':
                        mongo_dict[field] = process_field(mode, obj.xpath(f['text'])[index].css(f['next']['text'])[0], field, f)
                    case 'Selenium':
                        mongo_dict[field] = process_field(mode, obj.find_elements(By.XPATH, f['text'])[index].find_element(By.CSS_SELECTOR, f['next']['text']), field, f)
        else:
            match mode:
                case 'Scrapy':
                    mongo_dict[field] = process_field(mode, obj.xpath(f['text'])[index], field, f)
                case 'Selenium':
                    mongo_dict[field] = process_field(mode, obj.find_elements(By.XPATH, f['text'])[index], field, f)
    else:
        # Primary selector is CSS.
        if 'next' in f:
            if mode == 'Scrapy':
                f['next']['text'] = convert_selenium_to_scrapy(f['next']['text'])
            if f['next']['text'].startswith('//'):
                match mode:
                    case 'Scrapy':
                        mongo_dict[field] = process_field(mode, obj.css(f['text'])[index].xpath(f['next']['text'])[0], field, f)
                    case 'Selenium':
                        mongo_dict[field] = process_field(mode, obj.find_elements(By.CSS_SELECTOR, f['text'])[index].find_element(By.XPATH, f['next']['text']), field, f)
            else:
                match mode:
                    case 'Scrapy':
                        mongo_dict[field] = process_field(mode, obj.css(f['text'])[index].css(f['next']['text'])[0], field, f)
                    case 'Selenium':
                        mongo_dict[field] = process_field(mode, obj.find_elements(By.CSS_SELECTOR, f['text'])[index].find_element(By.CSS_SELECTOR, f['next']['text']), field, f)
        else:
            match mode:
                case 'Scrapy':
                    mongo_dict[field] = process_field(mode, obj.css(f['text'])[index], field, f)
                case 'Selenium':
                    mongo_dict[field] = process_field(mode, obj.find_elements(By.CSS_SELECTOR, f['text'])[index], field, f)
    if 'split' in f:
        # Optional post-processing: keep one piece of the extracted string.
        mongo_dict[field] = mongo_dict[field].split(f['split'])[f['split_index']]
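# Illustrative field spec (hypothetical selector values) as consumed by
# get_single_field: the second match of a CSS selector, a nested selector
# applied to it, and a split post-processing step.
# f = {'text': 'div.byline', 'index': 1,
#      'next': {'text': 'span.author-name'},
#      'split': '|', 'split_index': 0}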

def get_field(mode, obj, x, mongo_dict, field):
    ''' Gets a field using its field element information

    Parameters
    --------
    mode: str
        Mode - Scrapy or Selenium
    obj: scrapy.http.response.html.HtmlResponse / selenium_driver.SeleniumDriver
        response of webpage
    x: dict
        website attributes element
    mongo_dict: dict
        news record
    field: str
        field to be extracted
    '''
    if isinstance(x[field], list):
        # Several candidate specs: try each until one matches.
        for f in x[field]:
            try:
                get_single_field(mode, obj, f, mongo_dict, field)
                break
            except IndexError:
                pass
    else:
        get_single_field(mode, obj, x[field], mongo_dict, field)
    if field in mongo_dict and isinstance(mongo_dict[field], str):
        mongo_dict[field] = mongo_dict[field].replace('\n', ' ').strip()
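# Illustrative use (hypothetical selectors and variables): candidates are
# tried in order, so a site-specific selector can precede a generic fallback.
# x = {'title': [{'text': 'h1.headline'}, {'text': '//h1'}]}
# get_field('Scrapy', response, x, record, 'title')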

def process_field(mode, element, field, f):
    ''' Processes the text extracted from an element based on which field it is
    For e.g. for uploaddate the text will be converted to datetime.datetime,
    for content all paragraphs will be extracted and joined, and so on

    Parameters
    --------
    mode: str
        Mode - Scrapy or Selenium
    element: scrapy.http.response.html.HtmlResponse / selenium_driver.SeleniumDriver
        Scrapy/Selenium element
    field: str
        field to be processed (title, subtitle, author, etc)
    f: dict
        website attributes element

    Returns
    --------
    str/int/list/dict/datetime.datetime
        Processed field (title/subtitle/author, etc)
    '''
    match field:
        case 'subtitle' | 'author' | 'title':
            # Authors may come back as a list of elements.
            if field == 'author' and isinstance(element, list):
                match mode:
                    case 'Scrapy':
                        return [get_text_from_div(e.extract()).strip() for e in element]
                    case 'Selenium':
                        return [get_text_from_div(e.get_attribute('innerHTML')).strip() for e in element]
            match mode:
                case 'Scrapy':
                    return get_text_from_div(element.extract())
                case 'Selenium':
                    return get_text_from_div(element.get_attribute('innerHTML'))
        case 'uploaddate':
            match f['text']:
                case 'html':
                    if 'today' in f:
                        today = datetime.datetime.now()
                        return datetime.datetime(today.year, today.month, today.day)
                    match mode:
                        case 'Scrapy':
                            uploaddate = element.extract().lower()
                        case 'Selenium':
                            uploaddate = element.get_attribute('innerHTML').lower()
                    if 'to_datetime' in f:
                        return to_datetime(uploaddate.split(f['to_datetime'])[1].split('"')[2][:10])
                    # In the configured regex, 'month' and 'd{' are shorthands
                    # for months_pattern and digit groups.
                    pattern = f['re'].replace('month', months_pattern).replace('d{', '\\d{')
                    uploaddate = re.findall(pattern, uploaddate)[0]
                case 'url':
                    if 're' in f:
                        if 'order' not in f and 'to_datetime' in f:
                            f['order'] = [0, 1, 2]
                        pattern = f['re'].replace('month', months_pattern).replace('d{', '\\d{')
                        uploaddate = re.findall(pattern, element)[0]
                    else:
                        return to_datetime(element)
                case _:
                    if 'to_datetime' in f and f['to_datetime'] == '':
                        match mode:
                            case 'Scrapy':
                                return to_datetime(element.extract())
                            case 'Selenium':
                                if 'meta[' in f['text']:
                                    return to_datetime(element.get_attribute('content'))
                                if f['text'] == 'time':
                                    return to_datetime(element.get_attribute('datetime'))
                                return to_datetime(element.get_attribute('innerHTML'))
                    elif f['re'] == 'ago':
                        # Relative dates such as '3 hours ago'.
                        match mode:
                            case 'Scrapy':
                                return process_uploaddate(element.extract())
                            case 'Selenium':
                                return process_uploaddate(element.get_attribute('innerHTML'))
                    else:
                        pattern = f['re'].replace('month', months_pattern).replace('d{', '\\d{')
                        match mode:
                            case 'Scrapy':
                                text = convert_numbers(get_text_from_div(element.extract()).lower())
                            case 'Selenium':
                                if 'meta[' in f['text']:
                                    text = convert_numbers(get_text_from_div(element.get_attribute('content')).lower())
                                else:
                                    text = convert_numbers(get_text_from_div(element.get_attribute('innerHTML')).lower())
                        uploaddate = re.findall(pattern, text)[0]
            # f['order'] maps the captured groups to (year, month, day):
            # four-digit numbers are taken as-is, everything else is looked up
            # in the months dict.
            date_list = [int(uploaddate[f['order'][i]]) if len(uploaddate[f['order'][i]]) == 4 and uploaddate[f['order'][i]].isnumeric() else int(months[uploaddate[f['order'][i]].lower()]) for i in range(len(f['order']))]
            if len(date_list) == 2:
                # No year captured: assume this year unless that lies in the future.
                uploaddate = datetime.datetime(datetime.datetime.now().year, date_list[0], date_list[1])
                if uploaddate < datetime.datetime.now():
                    return uploaddate
                return datetime.datetime(datetime.datetime.now().year - 1, date_list[0], date_list[1])
            if 'add_2000' in f:
                date_list[0] = date_list[0] + 2000
            return datetime.datetime(date_list[0], date_list[1], date_list[2])
        case 'image':
            return extract_image_and_caption(mode, element)
        case 'content':
            if 'p' not in f:
                f['p'] = True
            if f['p'] == True:
                if 'skip_p' in f:
                    # Drop paragraphs matching any skip pattern, and (in Scrapy
                    # mode) paragraphs that are nothing but a link.
                    match mode:
                        case 'Scrapy':
                            return get_text_from_div('\n'.join([e.extract() for e in element.css('p') if len(re.findall('|'.join(f['skip_p']), get_text_from_div(e.extract()).strip())) == 0 and (len(e.css('a')) and get_text_from_div(e.extract().strip()) == get_text_from_div(e.css('a')[0].extract())) == False]))
                        case 'Selenium':
                            return get_text_from_div('\n'.join([e.get_attribute('innerHTML') for e in element.find_elements(By.CSS_SELECTOR, 'p') if len(re.findall('|'.join(f['skip_p']), get_text_from_div(e.get_attribute('innerHTML')).strip())) == 0]))
                match mode:
                    case 'Scrapy':
                        return get_text_from_div('\n'.join([e.extract() for e in element.css('p') if (len(e.css('a')) and get_text_from_div(e.extract().strip()) == get_text_from_div(e.css('a')[0].extract())) == False])).strip()
                    case 'Selenium':
                        return get_text_from_div('\n'.join([e.get_attribute('innerHTML') for e in element.find_elements(By.CSS_SELECTOR, 'p') if len(e.find_elements(By.CSS_SELECTOR, 'a')) == 0 or get_text_from_div(e.find_element(By.CSS_SELECTOR, 'a').get_attribute('innerHTML')).strip() != get_text_from_div(e.get_attribute('innerHTML')).strip()]))
            match mode:
                case 'Scrapy':
                    return get_text_from_div(element.extract())
                case 'Selenium':
                    return get_text_from_div(element.get_attribute('innerHTML'))
        case 'views' | 'comments' | 'likes' | 'dislikes':
            # Normalise counts such as '1.2K views' or '3M likes' to integers.
            match mode:
                case 'Scrapy':
                    count = get_text_from_div(element.extract().lower().replace(',', '').replace('like', '').replace('comment', '').replace('view', '')).replace('s', '').strip()
                case 'Selenium':
                    count = get_text_from_div(get_text_from_div(element.get_attribute('innerHTML')).lower().replace(',', '').replace('like', '').replace('comment', '').replace('view', '')).replace('s', '').strip()
            if 'k' in count:
                multiply = 1000
                count = count.replace('k', '').strip()
            elif 'm' in count:
                multiply = 1000000
                count = count.replace('m', '').strip()
            else:
                multiply = 1
            return int(float(count) * multiply)
        case 'tags':
            match mode:
                case 'Scrapy':
                    return [get_text_from_div(e.extract()).strip().lower() for e in element.css('a')]
                case 'Selenium':
                    return [get_text_from_div(e.get_attribute('innerHTML')).strip() for e in element.find_elements(By.CSS_SELECTOR, 'a')]
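# Illustrative 'uploaddate' spec (hypothetical values): in f['re'], 'month'
# stands for months_pattern and 'd{' for '\\d{', while f['order'] gives the
# positions of the year, month and day groups. For the text 'july 14, 2023',
# the pattern below captures ('july', '14', '2023') and order [2, 0, 1]
# yields datetime.datetime(2023, 7, 14).
# f = {'text': 'span.date', 're': '(month) (d{1,2}), (d{4})', 'order': [2, 0, 1]}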

def extract_image_and_caption(mode, image):
    ''' Extracts an image URL and its caption

    Parameters
    --------
    mode: str
        Mode - Scrapy or Selenium
    image: scrapy.http.response.html.HtmlResponse / selenium_driver.SeleniumDriver
        Scrapy/Selenium element

    Returns
    --------
    dict
        Image url and caption dict
    '''
    image_dict = {}
    match mode:
        case 'Scrapy':
            image_dict['url'] = image.css('img')[0].xpath('@src')[0].extract()
            # Prefer the alt text, fall back to the title attribute, else ''.
            try:
                try:
                    caption = image.css('img')[0].xpath('@alt')[0].extract()
                except IndexError:
                    caption = image.css('img')[0].xpath('@title')[0].extract()
            except IndexError:
                caption = ''
        case 'Selenium':
            img = image.find_element(By.CSS_SELECTOR, 'img')
            image_dict['url'] = img.get_attribute('src')
            # get_attribute returns None for missing attributes, so fall back
            # from alt to title to an empty string.
            caption = img.get_attribute('alt') or img.get_attribute('title') or ''
    if caption.strip() != '':
        image_dict['caption'] = get_text_from_div(caption)
    return image_dict
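# Illustrative result for an element wrapping
# '<img src="https://example.com/a.jpg" alt="A caption">':
# {'url': 'https://example.com/a.jpg', 'caption': 'A caption'}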