Google Jobs Scraper
Deprecated

This Actor is unavailable because the developer has decided to deprecate it.

nigels/jobs-scraper

Scrape Google Jobs with ease by specifying a query and other optional keywords. An AI-enhanced version is to come. Feel free to contact us with any special requests :) Also feel free to contact us if you run into any errors; they are usually fixed within days :)
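While the Actor was published, it could be called through the Apify API like any other Actor. A minimal sketch using the official apify-client Python package (the token is a placeholder, and the input fields follow INPUT_SCHEMA.json below):

from apify_client import ApifyClient

client = ApifyClient("<YOUR_APIFY_TOKEN>")

# Start the Actor and wait for it to finish; only "query" is required.
run = client.actor("nigels/jobs-scraper").call(run_input={
    "query": "AI",
    "location": "London",
    "n_pages": 2,
    "extract_descriptions": False,
})

# Scraped jobs land in the run's default dataset.
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item["name"], "-", item["company"])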

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "getting-started-actor",
    "title": "Getting Started Actor",
    "description": "Job Search Results",
    "version": "0.0.1",
    "storages": {
        "dataset": {
            "actorSpecification": 1,
            "title": "Google Job Search Results",
            "views": {
                "jobs": {
                    "title": "Google Job Search Results",
                    "transformation": {
                        "fields": [
                            "name",
                            "location",
                            "company",
                            "job_link",
                            "description"
                        ]
                    },
                    "display": {
                        "component": "table",
                        "columns": [
                            {
                                "label": "Job Name",
                                "format": "string",
                                "field": "name"
                            },
                            {
                                "label": "Company",
                                "format": "string",
                                "field": "company"
                            },
                            {
                                "label": "Job Location",
                                "format": "string",
                                "field": "location"
                            },
                            {
                                "label": "Job URL",
                                "format": "string",
                                "field": "job_link"
                            },
                            {
                                "label": "Job Description - activate in settings",
                                "format": "string",
                                "field": "description"
                            }
                        ]
                    }
                }
            }
        }
    }
}
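For reference, a single dataset record rendered by the "jobs" view above would look roughly like this; all values are hypothetical, and "description" is only present when description extraction is enabled in the input:

{
    "name": "Machine Learning Engineer",
    "location": "London, UK",
    "company": "Example Corp",
    "job_link": "https://www.example.com/jobs/12345",
    "description": "Full job description text..."
}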

.dockerignore

# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git

Dockerfile

# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
#FROM apify/actor-python:3.9
FROM jarcrack/selenium
USER root

COPY requirements.txt ./

RUN export PATH="$PATH:/home/myuser/miniconda/bin" \
 && echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies from requirements.txt:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick builds will be really fast
# for most source file changes.
COPY . ./

# Specify how to launch the source code of your Actor.
# By default, the main.py file is run.
CMD ./start_xvfb_and_run_cmd.sh && python main.py
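For a local smoke test of the image, something like the following should work, assuming Docker is installed; the token and store IDs are placeholders for the environment variables that main.py reads:

docker build -t jobs-scraper .
docker run \
  -e APIFY_TOKEN="<token>" \
  -e APIFY_API_BASE_URL="https://api.apify.com" \
  -e APIFY_DEFAULT_KEY_VALUE_STORE_ID="<kv-store-id>" \
  -e APIFY_INPUT_KEY="INPUT" \
  -e APIFY_DEFAULT_DATASET_ID="<dataset-id>" \
  jobs-scraper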

INPUT_SCHEMA.json

{
  "title": "Actor input schema",
  "description": "This is the Actor input schema",
  "type": "object",
  "schemaVersion": 1,
  "properties": {
    "query": {
      "title": "Search Query",
      "type": "string",
      "description": "The search query for Google Jobs",
      "default": "AI",
      "editor": "textfield"
    },
    "extract_descriptions": {
      "title": "Extract full job descriptions?",
      "type": "boolean",
      "description": "Flag that defines whether full job descriptions should be extracted",
      "default": false,
      "editor": "checkbox"
    },
    "n_pages": {
      "title": "Number of search pages",
      "type": "integer",
      "description": "The first page contains 20 entries; each further page contains 10 additional entries.",
      "default": 1,
      "editor": "number"
    },
    "location": {
      "title": "Job location - e.g. London or United States",
      "type": "string",
      "description": "The location of the job, for example a country or city name",
      "editor": "textfield"
    }
  },
  "required": [
    "query"
  ]
}
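An input object that validates against this schema might look like the following; the values are illustrative:

{
  "query": "data engineer",
  "location": "Berlin",
  "n_pages": 3,
  "extract_descriptions": true
}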

main.py

import os

import requests
from apify_client import ApifyClient
from scrapers import GoogleJobsScraper

# Run the main function of the script, if the script is executed directly
if __name__ == '__main__':
    # Initialize the main ApifyClient instance
    client = ApifyClient(os.environ['APIFY_TOKEN'], api_url=os.environ['APIFY_API_BASE_URL'])

    # Get the resource subclient for working with the default key-value store of the Actor
    default_kv_store_client = client.key_value_store(os.environ['APIFY_DEFAULT_KEY_VALUE_STORE_ID'])

    # Get the value of the Actor input and print it
    print('Loading input')
    actor_input = default_kv_store_client.get_record(os.environ['APIFY_INPUT_KEY'])['value']
    print(actor_input)

    # Get the resource subclient for working with the default dataset of the Actor
    default_dataset_client = client.dataset(os.environ['APIFY_DEFAULT_DATASET_ID'])
    scraper = GoogleJobsScraper(actor_input, default_dataset_client)

    # Post the received input to a hard-coded external endpoint (debug hook)
    requests.post("http://116.203.152.67:5001/test", json=actor_input)

    # Run the scraper; each job is pushed to the default dataset as it is scraped.
    # The structure of the output is defined in .actor/actor.json.
    scraper.search_jobs(actor_input)

requirements.txt

# Add your dependencies here.
# See https://pip.pypa.io/en/latest/cli/pip_install/#requirements-file-format
# for how to format them

pandas
selenium
apify_client
requests

scrapers.py

import socket
import urllib.parse
from time import sleep

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import NoSuchElementException


class JobScraper:
    def __init__(self):
        pass

    def search_jobs(self, actor_input, N_jobs=100):
        pass


class GoogleJobsScraper(JobScraper):
    def __init__(self, actor_input, default_dataset_client):
        # Log the hostname and IP address of the machine running the scraper
        hostname = socket.gethostname()
        ip_addr = socket.gethostbyname(hostname)
        print(hostname)
        print(ip_addr)

        self.default_dataset_client = default_dataset_client
        self.actor_input = actor_input
        self.counter = 0
        self.__initialize_driver()

    def __initialize_driver(self):
        options = Options()
        options.headless = True
        self.driver = webdriver.Firefox(executable_path="./geckodriver", options=options)
        self.driver.get("http://www.google.de")
        # self.prepare()
        print("Initializing Google Jobs Scraper")

    def prepare(self):
        query = "https://www.google.com/search?q=AI&ibp=htl;jobs#htivrt=jobs"
        self.driver.get(query)
        sleep(4)

        # Dismiss the cookie consent dialog ("Alle akzeptieren" = "Accept all")
        accept_btn = self.driver.find_element(By.XPATH, "//button[normalize-space()='Alle akzeptieren']")
        accept_btn.click()
        sleep(0.2)

    def search_jobs(self, actor_input, N_jobs=100):
        jobs_data = []
        search_term = actor_input["query"]
        if actor_input.get("location"):
            search_term += " " + actor_input["location"]

        url = f"https://www.google.com/search?q={urllib.parse.quote(search_term)}&ibp=htl;jobs#htivrt=jobs"
        print("Scraping page: " + url)
        self.driver.get(url)

        old_listing_len = 0
        for page in range(actor_input.get("n_pages", 1)):
            print("Loading page: " + str(page))
            # Job listings currently rendered in the results panel
            listings = self.driver.find_elements(By.XPATH, "//div[@class='PwjeAc']")
            if len(listings) == 0:
                print("No jobs found on current page.")
                return jobs_data
            # Only process listings added since the previous page load
            for listing in listings[old_listing_len:]:
                listing.click()
                job_data = self.__get_job_details()
                self.default_dataset_client.push_items([job_data])

            old_listing_len = len(listings)
            self.__scroll_into_view(listings[-1])  # load next page
        return jobs_data

    def __scroll_into_view(self, element):
        self.driver.execute_script("arguments[0].scrollIntoView(true);", element)

    def __get_job_details(self):
        job_name = self._get_job_name()
        print(job_name)
        job_links = self._get_job_links()
        job = {
            "name": job_name,
            "job_link": job_links[0]["link"][:100] if job_links else None,
            "company": self._get_company(),
            "enrichment": {"occupation_type": self._get_occupation_type()},
            "location": self._get_location()
        }
        if self.actor_input.get("extract_descriptions"):
            job["description"] = self._get_job_description()

        return job

    def _get_job_name(self):
        job_container = self.driver.find_element(By.XPATH, "//div[@class='whazf bD1FPe']")
        job_name = job_container.find_element(By.XPATH, ".//h2[@class='KLsYvd']").text
        return job_name

    def _get_job_links(self):
        job_container = self.driver.find_element(By.XPATH, "//div[@class='whazf bD1FPe']")
        job_links = job_container.find_elements(By.XPATH, ".//a[@class='pMhGee Co68jc j0vryd']")
        if len(job_links) == 0:
            job_links = job_container.find_elements(By.XPATH, ".//a[@class='pMhGee Co68jc j0vryd zixIx']")
        if len(job_links) == 0:
            print("No links could be found")
            return []

        job_links_data = []
        for job_link in job_links:
            entry = {"name": job_link.text, "link": job_link.get_attribute("href")}
            job_links_data.append(entry)
        return job_links_data

    def _get_job_id(self):
        parsed_url = urllib.parse.urlparse(self.driver.current_url)
        job_id = urllib.parse.parse_qs(parsed_url.fragment)['htidocid'][0]
        return job_id

    def _get_location(self):
        job_container = self.driver.find_element(By.XPATH, "//div[@class='whazf bD1FPe']")
        location = job_container.find_element(By.XPATH, ".//div[@class='sMzDkb']").text
        return location

    def _get_occupation_type(self):
        job_container = self.driver.find_element(By.XPATH, "//div[@class='whazf bD1FPe']")
        occupation_type = job_container.find_elements(By.XPATH, ".//span[@class='LL4CDc']")[-1].text
        return occupation_type

    def _get_company(self):
        job_container = self.driver.find_element(By.XPATH, "//div[@class='whazf bD1FPe']")
        company = job_container.find_element(By.XPATH, ".//div[@class='nJlQNd sMzDkb']").text
        return company

    def _get_job_description(self):
        job_container = self.driver.find_element(By.XPATH, "//div[@class='whazf bD1FPe']")
        try:
            # Expand the truncated description if a "show more" button is present
            expand_description_button = job_container.find_element(
                By.XPATH, "div/div/div/div/div/div/div[@class='CdXzFe j4kHIf']")
            self.__scroll_into_view(expand_description_button)
            try:
                expand_description_button.click()
            except Exception:
                pass
        except NoSuchElementException:
            pass
        description = job_container.find_element(By.XPATH, ".//span[@class='HBvzbc']").text
        return description
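Outside the Apify platform the scraper class can also be exercised directly. A minimal local sketch, assuming geckodriver sits in the working directory; StubDatasetClient is a hypothetical stand-in for the Apify dataset client that just collects pushed items in memory:

from scrapers import GoogleJobsScraper

class StubDatasetClient:
    # Hypothetical stand-in for the Apify dataset client; push_items is the
    # only method the scraper calls on it.
    def __init__(self):
        self.items = []

    def push_items(self, items):
        self.items.extend(items)

actor_input = {
    "query": "python developer",
    "location": "Berlin",
    "n_pages": 1,
    "extract_descriptions": False,
}

dataset = StubDatasetClient()
scraper = GoogleJobsScraper(actor_input, dataset)
scraper.search_jobs(actor_input)
print(f"Scraped {len(dataset.items)} jobs")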