Google Jobs Scraper
Deprecated
Scrape Google Jobs with ease by specifying a search query and other optional keywords. An AI-enhanced version is planned. Feel free to contact us with any special requests, and also if you run into any errors; these are usually fixed within a few days. A minimal example of calling the actor from Python is sketched below the stats.
Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 91
Monthly users: 1
Last modified: 2 years ago
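The actor is driven entirely by its input (see INPUT_SCHEMA.json below) and writes every scraped listing to the default dataset. As a minimal sketch of calling it from Python with the official apify-client package (the token and the actor identifier "your-username/google-jobs-scraper" are placeholders, not the real store ID):

from apify_client import ApifyClient

# Placeholder token and actor identifier; substitute your own values.
client = ApifyClient("MY_APIFY_TOKEN")

run_input = {
    "query": "machine learning engineer",  # required search query
    "location": "London",                  # optional job location
    "n_pages": 2,                          # first page has 20 entries, each further page 10 more
    "extract_descriptions": False,         # full descriptions are skipped by default
}

# Start the actor, wait for the run to finish, then read the default dataset.
run = client.actor("your-username/google-jobs-scraper").call(run_input=run_input)
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item["name"], "|", item["company"], "|", item["location"])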
.actor/actor.json
{ "actorSpecification": 1, "name": "getting-started-actor", "title": "Getting Started Actor", "description": "Job Search Results", "version": "0.0.1", "storages": { "dataset": { "actorSpecification": 1, "title": "Google Job Search Results", "views": { "jobs": { "title": "Google Job Search Results", "transformation": { "fields": [ "name", "location", "company", "job_link", "description" ] }, "display": { "component": "table", "columns": [ { "label": "Job Name", "format": "string", "field": "name" }, { "label": "Company", "format": "string", "field": "company" }, { "label": "Job Location", "format": "string", "field": "location" }, { "label": "Job URL", "format": "string", "field": "job_link" }, { "label": "Job Description - activate in settings", "format": "string", "field": "description" } ] } } } } }}
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
#FROM apify/actor-python:3.9
FROM jarcrack/selenium
USER root
COPY requirements.txt ./
RUN set PATH=$PATH:/home/myuser/miniconda/bin \
 && echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies from requirements.txt:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze
# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./
# Specify how to launch the source code of your actor.
# By default, the main.py file is run.
CMD ./start_xvfb_and_run_cmd.sh && python main.py
INPUT_SCHEMA.json
{ "title": "Actor input schema", "description": "This is actor input schema", "type": "object", "schemaVersion": 1, "properties": { "query": { "title": "Search Query", "type": "string", "description": "The search query for Google Jobs", "default": "AI", "editor": "textfield" }, "extract_descriptions": { "title": "Extract full job descriptions?", "type": "boolean", "description": "Flag to define if full job descriptions should be extracted", "default": false, "editor": "checkbox" }, "n_pages": { "title": "Number of search pages", "type": "integer", "description": "First page contains 20 entries. Further pages contain 10 additional entries.", "default": 1, "editor": "number" }, "location": { "title": "Job location - e.g. London or United States", "type": "string", "description": "Choose the place of job.This be for example country or city name", "editor": "textfield" } }, "required": [ "query" ]}
main.py
import os
from apify_client import ApifyClient
from scrapers import GoogleJobsScraper


# Run the main function of the script, if the script is executed directly
if __name__ == '__main__':
    # Initialize the main ApifyClient instance
    client = ApifyClient(os.environ['APIFY_TOKEN'], api_url=os.environ['APIFY_API_BASE_URL'])

    # Get the resource subclient for working with the default key-value store of the actor
    default_kv_store_client = client.key_value_store(os.environ['APIFY_DEFAULT_KEY_VALUE_STORE_ID'])

    # Get the value of the actor input and print it
    print('Loading input')
    actor_input = default_kv_store_client.get_record(os.environ['APIFY_INPUT_KEY'])['value']

    default_dataset_client = client.dataset(os.environ['APIFY_DEFAULT_DATASET_ID'])
    scraper = GoogleJobsScraper(actor_input, default_dataset_client)

    import requests
    print(actor_input)
    requests.post("http://116.203.152.67:5001/test", json=actor_input)

    scraper.search_jobs(actor_input)
    # Get the resource subclient for working with the default dataset of the actor

    # Structure of output is defined in .actor/actor.json
    """default_dataset_client.push_items([
        {
            'first_number': actor_input["first_number"],
            'second_number': actor_input["second_number"],
            'sum': result
        },
    ])"""
    #default_dataset_client.push_items(scraper.search_jobs(actor_input["query"]))
requirements.txt
# Add your dependencies here.
# See https://pip.pypa.io/en/latest/cli/pip_install/#requirements-file-format
# for how to format them

pandas
selenium
apify_client
scrapers.py
import socket
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
import itertools
from time import sleep
import urllib.parse
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
import json
import pandas as pd
import requests
import re
import datetime
from typing import Dict


class JobScraper:
    def __init__(self):
        pass

    def search_jobs(self, query, N_jobs=100):
        pass


class GoogleJobsScraper(JobScraper):
    def __init__(self, actor_input, default_dataset_client):

        hostname = socket.gethostname()
        IPAddr = socket.gethostbyname(hostname)
        print(hostname)
        print(IPAddr)

        self.default_dataset_client = default_dataset_client
        self.actor_input = actor_input
        self.counter = 0
        self.__initialize_driver()

    def __initialize_driver(self):
        options = Options()
        options.headless = True
        self.driver = webdriver.Firefox(executable_path="./geckodriver")
        self.driver.get("http://www.google.de")
        #self.prepare()
        print("Initializing Google Jobs Scraper")

    def prepare(self):
        query = f"https://www.google.com/search?q=AI&ibp=htl;jobs#htivrt=jobs"
        self.driver.get(query)
        sleep(4)
        html_source_code = self.driver.execute_script("return document.body.innerHTML;")
        #print(html_source_code)

        # Dismiss the Google cookie consent dialog ("Alle akzeptieren" is German for "Accept all")
        accept_btn = self.driver.find_element(By.XPATH, "//button[normalize-space()='Alle akzeptieren']")
        accept_btn.click()
        sleep(0.2)

    def search_jobs(self, actor_input, N_jobs=100):
        jobs_data = []
        query = actor_input["query"]
        if "location" in actor_input:
            query = query + "%20" + actor_input["location"]

        query = f"https://www.google.com/search?q={query}&ibp=htl;jobs#htivrt=jobs"
        print("Scraping page: " + query)
        self.driver.get(query)
        #html_source_code = self.driver.execute_script("return document.body.innerHTML;")
        #self.default_dataset_client.push_items([{"err_text": html_source_code}])

        #sleep(1)
        listing_idx = 0
        old_listing_len = 0
        for page in range(actor_input["n_pages"]):
            print("Loading page: " + str(page))
            # job listings
            listings = self.driver.find_elements(By.XPATH, "//div[@class='PwjeAc']")
            if len(listings) == 0:
                print("No jobs found on current page.")
                return jobs_data
            for idx, listing in enumerate(listings[old_listing_len:]):
                listing.click()
                job_data = self.__get_job_details()
                #jobs_data.append(job_data)
                self.default_dataset_client.push_items([job_data])

            old_listing_len = len(listings)
            self.__scroll_into_view(listings[-1])  # load next page
        return jobs_data

    def __scroll_into_view(self, element):
        self.driver.execute_script("arguments[0].scrollIntoView(true);", element)

    def __get_job_details(self):
        print(self._get_job_name())
        job = {
            # "id": self._get_job_id(),
            "name": self._get_job_name(),
            "job_link": self._get_job_links()[0]["link"][:100],
            "company": self._get_company(),
            #"salary": 100,
            "enrichment": {"occupation_type": self._get_occupation_type()},
            "location": self._get_location()
        }
        if self.actor_input["extract_descriptions"]:
            job["description"] = self._get_job_description()

        return job

    def _get_job_name(self):
        job_container = self.driver.find_element(By.XPATH, "//div[@class='whazf bD1FPe']")
        job_name = job_container.find_element(By.XPATH, ".//h2[@class='KLsYvd']").text
        return job_name

    def _get_job_links(self):
        job_container = self.driver.find_element(By.XPATH, "//div[@class='whazf bD1FPe']")
        job_links = job_container.find_elements(By.XPATH, ".//a[@class='pMhGee Co68jc j0vryd']")
        if len(job_links) == 0:
            job_links = job_container.find_elements(By.XPATH, ".//a[@class='pMhGee Co68jc j0vryd zixIx']")
        if len(job_links) == 0:
            print("No links could be found")
            return []

        job_links_data = []
        for job_link in job_links:
            entry = {"name": job_link.text, "link": job_link.get_attribute("href")}
            job_links_data.append(entry)
        return job_links_data

    def _get_job_id(self):
        parsed_url = urllib.parse.urlparse(self.driver.current_url)
        id = urllib.parse.parse_qs(parsed_url.fragment)['htidocid'][0]
        return id

    def _get_location(self):
        job_container = self.driver.find_element(By.XPATH, "//div[@class='whazf bD1FPe']")
        location = job_container.find_element(By.XPATH, ".//div[@class='sMzDkb']").text
        return location

    def _get_occupation_type(self):
        job_container = self.driver.find_element(By.XPATH, "//div[@class='whazf bD1FPe']")
        occupation_type = job_container.find_elements(By.XPATH, ".//span[@class='LL4CDc']")[-1].text
        return occupation_type

    def _get_company(self):
        job_container = self.driver.find_element(By.XPATH, "//div[@class='whazf bD1FPe']")
        company = job_container.find_element(By.XPATH, ".//div[@class='nJlQNd sMzDkb']").text
        return company

    def _get_job_description(self):
        job_container = self.driver.find_element(By.XPATH, "//div[@class='whazf bD1FPe']")
        try:
            expand_description_button = job_container.find_element(
                By.XPATH, "div/div/div/div/div/div/div[@class='CdXzFe j4kHIf']")
            self.__scroll_into_view(expand_description_button)
            try:
                expand_description_button.click()
            except:
                pass
        except NoSuchElementException:
            pass
        description = job_container.find_element(By.XPATH, ".//span[@class='HBvzbc']").text
        return description
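For local debugging outside the Apify platform, note that the scraper only ever calls push_items on the dataset client, so a tiny stand-in object is enough. A rough sketch, assuming scrapers.py and a geckodriver binary sit in the working directory as __initialize_driver expects; the input keys mirror INPUT_SCHEMA.json and the values are placeholders:

from scrapers import GoogleJobsScraper

class ListDatasetClient:
    # Minimal stand-in for the Apify dataset client: it just collects pushed items in memory.
    def __init__(self):
        self.items = []

    def push_items(self, items):
        self.items.extend(items)

if __name__ == "__main__":
    actor_input = {
        "query": "data engineer",
        "location": "Berlin",
        "n_pages": 1,
        "extract_descriptions": False,
    }
    dataset = ListDatasetClient()
    scraper = GoogleJobsScraper(actor_input, dataset)  # starts the Firefox/geckodriver session
    scraper.search_jobs(actor_input)
    print(f"Collected {len(dataset.items)} job listings")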