1import socket
2import os
3from selenium import webdriver
4from selenium.webdriver.common.by import By
5import itertools
6from time import sleep
7import urllib
8from selenium.common.exceptions import NoSuchElementException
9from selenium.webdriver.chrome.options import Options
10import json
11import pandas as pd
12import requests
13import re
14import datetime
15from typing import Dict
16
17
class JobScraper:
    """Abstract base class for job scrapers.

    Concrete scrapers are expected to override ``search_jobs``; the base
    implementation is a no-op that returns ``None``.
    """

    def __init__(self):
        # No shared state; subclasses set up their own resources.
        pass

    def search_jobs(self, query, N_jobs=100):
        """Scrape up to ``N_jobs`` postings for ``query``.

        Overridden by subclasses; the base version does nothing.
        """
        pass
24
25
class GoogleJobsScraper(JobScraper):
    """Scrapes job postings from the Google Jobs widget via Selenium/Firefox.

    Each scraped job dict is pushed to ``default_dataset_client`` as soon as
    it is extracted, and also collected into the list that ``search_jobs``
    returns.
    """

    def __init__(self, actor_input, default_dataset_client):
        """Store run configuration and start the headless browser.

        Args:
            actor_input: dict describing the run; expected keys include
                ``query``, ``n_pages`` and ``extract_descriptions``, and
                optionally ``location``. (Schema inferred from usage below —
                confirm against the caller.)
            default_dataset_client: client whose ``push_items`` method
                receives each scraped job dict.
        """
        # Log host identity — useful when running inside a container/actor
        # platform where the host is not obvious.
        hostname = socket.gethostname()
        ip_addr = socket.gethostbyname(hostname)
        print(hostname)
        print(ip_addr)

        self.default_dataset_client = default_dataset_client
        self.actor_input = actor_input
        self.counter = 0
        self.__initialize_driver()

    def __initialize_driver(self):
        """Start a headless Firefox session and load Google once.

        BUG FIX: the original built (Chrome!) ``Options``, set
        ``headless=True``, and then never passed them to the driver, so the
        browser always launched non-headless. We now build Firefox options
        and hand them to the driver.
        """
        # Local import: the top-of-file import is Chrome's Options class,
        # which is the wrong type for a Firefox driver.
        from selenium.webdriver.firefox.options import Options as FirefoxOptions

        options = FirefoxOptions()
        options.headless = True
        self.driver = webdriver.Firefox(
            executable_path="./geckodriver", options=options
        )
        # Initial page load so the consent/cookie flow can be handled early.
        self.driver.get("http://www.google.de")

        print("Initializing Google Jobs Scraper")

    def prepare(self):
        """Open the jobs widget once and dismiss the cookie-consent dialog."""
        query = f"https://www.google.com/search?q=AI&ibp=htl;jobs#htivrt=jobs"
        self.driver.get(query)
        sleep(4)

        # Best-effort: the German consent dialog ("Alle akzeptieren") only
        # appears on fresh sessions; its absence is not an error.
        try:
            accept_btn = self.driver.find_element(
                By.XPATH, "//button[normalize-space()='Alle akzeptieren']"
            )
            accept_btn.click()
            sleep(0.2)
        except NoSuchElementException:
            pass

    def search_jobs(self, actor_input, N_jobs=100,):
        """Scrape job listings for ``actor_input['query']``.

        Iterates ``actor_input['n_pages']`` scroll-pages of the Google Jobs
        list, clicking each listing and extracting its details. Every job is
        pushed to the dataset client immediately and collected in the
        returned list.

        Args:
            actor_input: dict with ``query``, ``n_pages`` and optionally
                ``location``.
            N_jobs: currently unused upper bound kept for interface
                compatibility with ``JobScraper.search_jobs``.

        Returns:
            list of job dicts scraped so far (empty if nothing was found).
        """
        jobs_data = []
        query = actor_input["query"]
        if "location" in actor_input:
            query = query + "%20" + actor_input["location"]

        query = f"https://www.google.com/search?q={query}&ibp=htl;jobs#htivrt=jobs"
        print("Scraping page: " + query)
        self.driver.get(query)

        old_listing_len = 0
        for page in range(actor_input["n_pages"]):
            print("Loading page: " + str(page))

            listings = self.driver.find_elements(By.XPATH, "//div[@class='PwjeAc']")
            if len(listings) == 0:
                print("No jobs found on current page.")
                return jobs_data
            # Only process listings added since the previous scroll-page.
            for listing in listings[old_listing_len:]:
                listing.click()
                job_data = self.__get_job_details()
                # BUG FIX: the original never appended, so this method always
                # returned an empty list despite pushing items downstream.
                jobs_data.append(job_data)
                self.default_dataset_client.push_items([job_data])

            old_listing_len = len(listings)
            # Scroll the last listing into view to trigger lazy-loading of
            # the next batch.
            self.__scroll_into_view(listings[-1])
        return jobs_data

    def __scroll_into_view(self, element):
        """Scroll ``element`` to the top of the viewport via JS."""
        self.driver.execute_script("arguments[0].scrollIntoView(true);", element)

    def __get_job_details(self):
        """Extract all fields for the currently selected job listing."""
        print(self._get_job_name())
        # BUG FIX: guard against _get_job_links() returning [] — the original
        # indexed [0] unconditionally and raised IndexError in that case.
        links = self._get_job_links()
        job = {
            "name": self._get_job_name(),
            # Link is truncated to 100 chars, matching the original behavior.
            "job_link": links[0]["link"][:100] if links else None,
            "company": self._get_company(),
            "enrichment": {"occupation_type": self._get_occupation_type()},
            "location": self._get_location(),
        }
        if self.actor_input["extract_descriptions"]:
            job["description"] = self._get_job_description()

        return job

    def _get_job_name(self):
        """Return the job title from the detail pane."""
        job_container = self.driver.find_element(By.XPATH, "//div[@class='whazf bD1FPe']")
        job_name = job_container.find_element(By.XPATH, ".//h2[@class='KLsYvd']").text
        return job_name

    def _get_job_links(self):
        """Return [{'name', 'link'}, ...] for the apply links, or [] if none.

        Google renders apply links under two slightly different class lists,
        so a second XPath is tried when the first matches nothing.
        """
        job_container = self.driver.find_element(By.XPATH, "//div[@class='whazf bD1FPe']")
        job_links = job_container.find_elements(By.XPATH, ".//a[@class='pMhGee Co68jc j0vryd']")
        if len(job_links) == 0:
            job_links = job_container.find_elements(By.XPATH, ".//a[@class='pMhGee Co68jc j0vryd zixIx']")
        if len(job_links) == 0:
            print("No links could be found")
            return []

        return [
            {"name": job_link.text, "link": job_link.get_attribute("href")}
            for job_link in job_links
        ]

    def _get_job_id(self):
        """Return the job's document id parsed from the URL fragment."""
        parsed_url = urllib.parse.urlparse(self.driver.current_url)
        job_id = urllib.parse.parse_qs(parsed_url.fragment)['htidocid'][0]
        return job_id

    def _get_location(self):
        """Return the job location text from the detail pane."""
        job_container = self.driver.find_element(By.XPATH, "//div[@class='whazf bD1FPe']")
        location = job_container.find_element(By.XPATH, ".//div[@class='sMzDkb']").text
        return location

    def _get_occupation_type(self):
        """Return the occupation type (e.g. full-time); it is the last badge."""
        job_container = self.driver.find_element(By.XPATH, "//div[@class='whazf bD1FPe']")
        occupation_type = job_container.find_elements(By.XPATH, ".//span[@class='LL4CDc']")[-1].text
        return occupation_type

    def _get_company(self):
        """Return the company name from the detail pane."""
        job_container = self.driver.find_element(By.XPATH, "//div[@class='whazf bD1FPe']")
        company = job_container.find_element(By.XPATH, ".//div[@class='nJlQNd sMzDkb']").text
        return company

    def _get_job_description(self):
        """Return the (expanded, if possible) job description text."""
        job_container = self.driver.find_element(By.XPATH, "//div[@class='whazf bD1FPe']")
        try:
            expand_description_button = job_container.find_element(
                By.XPATH, "div/div/div/div/div/div/div[@class='CdXzFe j4kHIf']"
            )
            self.__scroll_into_view(expand_description_button)
            # Best effort: the click can fail if the button is obscured; the
            # collapsed description is still usable.
            try:
                expand_description_button.click()
            except Exception:
                pass
        except NoSuchElementException:
            # Short descriptions have no "more" button at all.
            pass
        description = job_container.find_element(By.XPATH, ".//span[@class='HBvzbc']").text
        return description