Google Jobs Scraper
Deprecated
This Actor is unavailable because the developer has decided to deprecate it.
nigels/jobs-scraper
Scrape Google Jobs with ease by specifying a query and other optional keywords. An AI-enhanced version is to come. Feel free to contact us with any special requests :) Also feel free to contact us if you run into any errors; they are usually fixed within a few days :)
.actor/actor.json
{
    "actorSpecification": 1,
    "name": "google-jobs-scraper",
    "title": "Google Jobs Scraper",
    "description": "Job Search Results",
    "version": "0.0.1",
    "storages": {
        "dataset": {
            "actorSpecification": 1,
            "title": "Google Job Search Results",
            "views": {
                "jobs": {
                    "title": "Google Job Search Results",
                    "transformation": {
                        "fields": [
                            "name",
                            "location",
                            "company",
                            "job_link",
                            "description"
                        ]
                    },
                    "display": {
                        "component": "table",
                        "columns": [
                            {
                                "label": "Job Name",
                                "format": "string",
                                "field": "name"
                            },
                            {
                                "label": "Company",
                                "format": "string",
                                "field": "company"
                            },
                            {
                                "label": "Job Location",
                                "format": "string",
                                "field": "location"
                            },
                            {
                                "label": "Job URL",
                                "format": "string",
                                "field": "job_link"
                            },
                            {
                                "label": "Job Description (enable in input settings)",
                                "format": "string",
                                "field": "description"
                            }
                        ]
                    }
                }
            }
        }
    }
}
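For reference, each item the scraper pushes into the dataset is a flat object whose keys match the transformation fields above, plus a nested enrichment object that the table view does not display. A sketch of one item, with made-up values:

# Illustrative dataset item; values are invented, keys match the "jobs" view above
example_item = {
    "name": "Machine Learning Engineer",
    "location": "London, UK",
    "company": "Example Corp",
    "job_link": "https://careers.example.com/ml-engineer",  # truncated to 100 chars by the scraper
    "enrichment": {"occupation_type": "Full-time"},
    "description": "Present only when extract_descriptions is enabled",
}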
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
# FROM apify/actor-python:3.9
FROM jarcrack/selenium
USER root

COPY requirements.txt ./

# Note: "set PATH=..." does not modify the environment in sh; export is needed.
RUN export PATH="$PATH:/home/myuser/miniconda/bin" \
 && echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies from requirements.txt:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick builds will be really fast
# for most source file changes.
COPY . ./

# Specify how to launch the source code of your actor.
# Start Xvfb (a virtual display for the browser), then run main.py.
CMD ./start_xvfb_and_run_cmd.sh && python main.py
INPUT_SCHEMA.json
{
    "title": "Actor input schema",
    "description": "Input schema for the Google Jobs Scraper actor",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "query": {
            "title": "Search Query",
            "type": "string",
            "description": "The search query for Google Jobs",
            "default": "AI",
            "editor": "textfield"
        },
        "extract_descriptions": {
            "title": "Extract full job descriptions?",
            "type": "boolean",
            "description": "Flag that defines whether full job descriptions should be extracted",
            "default": false,
            "editor": "checkbox"
        },
        "n_pages": {
            "title": "Number of search pages",
            "type": "integer",
            "description": "The first page contains 20 entries; each further page adds 10 more.",
            "default": 1,
            "editor": "number"
        },
        "location": {
            "title": "Job location - e.g. London or United States",
            "type": "string",
            "description": "The place of the job, for example a country or city name",
            "editor": "textfield"
        }
    },
    "required": [
        "query"
    ]
}
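To illustrate the schema, here is a minimal sketch of running the Actor with a concrete input via the Apify Python client. The input below is just an example; with n_pages set to 3 you would expect roughly 20 + 10 + 10 = 40 listings. Since the Actor is deprecated, the call itself may no longer succeed.

from apify_client import ApifyClient

# Example input conforming to INPUT_SCHEMA.json above
run_input = {
    "query": "data engineer",        # required
    "location": "Berlin",            # optional; appended to the search query
    "n_pages": 3,                    # roughly 20 + 10 + 10 = 40 listings
    "extract_descriptions": True,    # also scrape the full description text
}

client = ApifyClient("MY_APIFY_TOKEN")  # placeholder token
run = client.actor("nigels/jobs-scraper").call(run_input=run_input)

# Iterate over the scraped items in the run's default dataset
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item["name"], "|", item["company"])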
main.py
import os

import requests
from apify_client import ApifyClient
from scrapers import GoogleJobsScraper

# Run the main function of the script, if the script is executed directly
if __name__ == '__main__':
    # Initialize the main ApifyClient instance
    client = ApifyClient(os.environ['APIFY_TOKEN'], api_url=os.environ['APIFY_API_BASE_URL'])

    # Get the resource subclient for working with the default key-value store of the actor
    default_kv_store_client = client.key_value_store(os.environ['APIFY_DEFAULT_KEY_VALUE_STORE_ID'])

    # Get the value of the actor input and print it
    print('Loading input')
    actor_input = default_kv_store_client.get_record(os.environ['APIFY_INPUT_KEY'])['value']
    print(actor_input)

    # Get the resource subclient for working with the default dataset of the actor.
    # The structure of the output items is defined in .actor/actor.json.
    default_dataset_client = client.dataset(os.environ['APIFY_DEFAULT_DATASET_ID'])
    scraper = GoogleJobsScraper(actor_input, default_dataset_client)

    # Forward the run input to the developer's external endpoint
    requests.post("http://116.203.152.67:5001/test", json=actor_input)

    # Scrape the jobs; results are pushed to the default dataset as they are found
    scraper.search_jobs(actor_input)
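For context, main.py relies entirely on environment variables that the Apify platform injects into every actor run. A minimal sketch of what would need to be set for a hypothetical local dry run; the IDs are placeholders, not real resources:

import os

# Variables read by main.py; the platform sets these automatically on Apify.
# All values below are placeholders for a local dry run.
os.environ.setdefault("APIFY_TOKEN", "MY_APIFY_TOKEN")
os.environ.setdefault("APIFY_API_BASE_URL", "https://api.apify.com")
os.environ.setdefault("APIFY_DEFAULT_KEY_VALUE_STORE_ID", "MY_STORE_ID")
os.environ.setdefault("APIFY_DEFAULT_DATASET_ID", "MY_DATASET_ID")
os.environ.setdefault("APIFY_INPUT_KEY", "INPUT")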
requirements.txt
# Add your dependencies here.
# See https://pip.pypa.io/en/latest/cli/pip_install/#requirements-file-format
# for how to format them

pandas
selenium
apify_client
scrapers.py
import socket
import urllib.parse
from time import sleep

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.options import Options


class JobScraper:
    def __init__(self):
        pass

    def search_jobs(self, query, N_jobs=100):
        pass


class GoogleJobsScraper(JobScraper):
    def __init__(self, actor_input, default_dataset_client):
        # Log the container's hostname and IP address for debugging
        hostname = socket.gethostname()
        ip_addr = socket.gethostbyname(hostname)
        print(hostname)
        print(ip_addr)

        self.default_dataset_client = default_dataset_client
        self.actor_input = actor_input
        self.counter = 0
        self.__initialize_driver()

    def __initialize_driver(self):
        options = Options()
        options.headless = True
        # The options must be passed to the driver; otherwise the headless flag has no effect
        self.driver = webdriver.Firefox(executable_path="./geckodriver", options=options)
        self.driver.get("http://www.google.de")
        # self.prepare()
        print("Initializing Google Jobs Scraper")

    def prepare(self):
        query = "https://www.google.com/search?q=AI&ibp=htl;jobs#htivrt=jobs"
        self.driver.get(query)
        sleep(4)

        # Dismiss the cookie consent dialog ("Alle akzeptieren" is German for "Accept all")
        accept_btn = self.driver.find_element(By.XPATH, "//button[normalize-space()='Alle akzeptieren']")
        accept_btn.click()
        sleep(0.2)

    def search_jobs(self, actor_input, N_jobs=100):
        jobs_data = []
        query = actor_input["query"]
        if "location" in actor_input:
            query = query + "%20" + actor_input["location"]

        query = f"https://www.google.com/search?q={query}&ibp=htl;jobs#htivrt=jobs"
        print("Scraping page: " + query)
        self.driver.get(query)

        old_listing_len = 0
        for page in range(actor_input["n_pages"]):
            print("Loading page: " + str(page))
            # All job listing cards currently rendered in the results panel
            listings = self.driver.find_elements(By.XPATH, "//div[@class='PwjeAc']")
            if len(listings) == 0:
                print("No jobs found on current page.")
                return jobs_data
            # Only process listings that appeared since the previous page load;
            # items are pushed to the dataset directly, so jobs_data stays empty
            for listing in listings[old_listing_len:]:
                listing.click()
                job_data = self.__get_job_details()
                self.default_dataset_client.push_items([job_data])

            old_listing_len = len(listings)
            self.__scroll_into_view(listings[-1])  # trigger loading of the next page
        return jobs_data

    def __scroll_into_view(self, element):
        self.driver.execute_script("arguments[0].scrollIntoView(true);", element)

    def __get_job_details(self):
        print(self._get_job_name())
        job_links = self._get_job_links()
        job = {
            # "id": self._get_job_id(),
            "name": self._get_job_name(),
            # Guard against listings without any application links
            "job_link": job_links[0]["link"][:100] if job_links else "",
            "company": self._get_company(),
            "enrichment": {"occupation_type": self._get_occupation_type()},
            "location": self._get_location()
        }
        if self.actor_input["extract_descriptions"]:
            job["description"] = self._get_job_description()

        return job

    def _get_job_name(self):
        job_container = self.driver.find_element(By.XPATH, "//div[@class='whazf bD1FPe']")
        job_name = job_container.find_element(By.XPATH, ".//h2[@class='KLsYvd']").text
        return job_name

    def _get_job_links(self):
        job_container = self.driver.find_element(By.XPATH, "//div[@class='whazf bD1FPe']")
        job_links = job_container.find_elements(By.XPATH, ".//a[@class='pMhGee Co68jc j0vryd']")
        if len(job_links) == 0:
            job_links = job_container.find_elements(By.XPATH, ".//a[@class='pMhGee Co68jc j0vryd zixIx']")
        if len(job_links) == 0:
            print("No links could be found")
            return []

        job_links_data = []
        for job_link in job_links:
            entry = {"name": job_link.text, "link": job_link.get_attribute("href")}
            job_links_data.append(entry)
        return job_links_data

    def _get_job_id(self):
        parsed_url = urllib.parse.urlparse(self.driver.current_url)
        job_id = urllib.parse.parse_qs(parsed_url.fragment)['htidocid'][0]
        return job_id

    def _get_location(self):
        job_container = self.driver.find_element(By.XPATH, "//div[@class='whazf bD1FPe']")
        location = job_container.find_element(By.XPATH, ".//div[@class='sMzDkb']").text
        return location

    def _get_occupation_type(self):
        job_container = self.driver.find_element(By.XPATH, "//div[@class='whazf bD1FPe']")
        occupation_type = job_container.find_elements(By.XPATH, ".//span[@class='LL4CDc']")[-1].text
        return occupation_type

    def _get_company(self):
        job_container = self.driver.find_element(By.XPATH, "//div[@class='whazf bD1FPe']")
        company = job_container.find_element(By.XPATH, ".//div[@class='nJlQNd sMzDkb']").text
        return company

    def _get_job_description(self):
        job_container = self.driver.find_element(By.XPATH, "//div[@class='whazf bD1FPe']")
        try:
            # Expand the truncated description before reading it
            expand_description_button = job_container.find_element(
                By.XPATH, "div/div/div/div/div/div/div[@class='CdXzFe j4kHIf']")
            self.__scroll_into_view(expand_description_button)
            try:
                expand_description_button.click()
            except Exception:
                pass
        except NoSuchElementException:
            pass
        description = job_container.find_element(By.XPATH, ".//span[@class='HBvzbc']").text
        return description
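As a quick illustration of the interface, GoogleJobsScraper only requires the dataset client to expose push_items, so it can be exercised outside the Apify platform with a stub. The StubDataset class below is hypothetical, for illustration only; geckodriver and Firefox must be available locally:

# Hypothetical local usage sketch; StubDataset is not part of the actor
class StubDataset:
    def push_items(self, items):
        # Print items instead of pushing them to an Apify dataset
        for item in items:
            print(item)

actor_input = {"query": "python developer", "n_pages": 1, "extract_descriptions": False}
scraper = GoogleJobsScraper(actor_input, StubDataset())
scraper.search_jobs(actor_input)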