# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python-selenium:3.11

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step.
# Keeping it in its own layer lets Docker reuse the cached install layer
# whenever only the source code changes.
COPY requirements.txt ./

# Install the packages specified in requirements.txt.
# Also print the Python version, the pip version and all installed packages
# with their versions, so the build log can be used for debugging.
# --no-cache-dir keeps pip's download cache out of the image layer,
# which would otherwise only add to the image size.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install --no-cache-dir -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, rebuilds will be really fast
# for most source file changes.
COPY . ./

# Byte-compile the sources so that syntax errors fail the build
# instead of surfacing when the Actor starts.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "my-actor-1",
    "title": "Getting started with Python and Selenium",
    "description": "Scrapes titles of websites using Selenium.",
    "version": "0.0",
    "meta": {
        "templateId": "python-selenium"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile",
    "storages": {
        "dataset": {
            "actorSpecification": 1,
            "title": "Name of repo and stars",
            "views": {
                "titles": {
                    "title": "Name of repo and stars",
                    "transformation": {
                        "fields": [
                            "repo_name",
                            "stars_count"
                        ]
                    },
                    "display": {
                        "component": "table",
                        "properties": {
                            "repo_name": {
                                "label": "Name",
                                "format": "text"
                            },
                            "stars_count": {
                                "label": "Stars",
                                "format": "text"
                            }
                        }
                    }
                }
            }
        }
    }
}

.actor/input_schema.json

{
    "title": "Python Selenium Scraper",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "start_urls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with",
            "prefill": [
                { "url": "https://github.com/apify/crawlee" }
            ],
            "editor": "requestListSources"
        },
        "max_depth": {
            "title": "Maximum depth",
            "type": "integer",
            "description": "Maximum depth to which to scrape",
            "default": 1
        }
    },
    "required": ["start_urls"]
}

src/__main__.py

"""
This module serves as the entry point for executing the Apify Actor.

It attaches a stream handler that uses `ActorLogFormatter` to the `apify_client`
and `apify` loggers, and then executes the `main()` coroutine using `asyncio.run()`.

Feel free to modify this file to suit your specific needs.
"""

import asyncio
import logging

from apify.log import ActorLogFormatter

from .main import main

# Configure loggers: a single stream handler with the Apify formatter
# is shared by both loggers below.
handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter())

# The API client logger is set to INFO level.
apify_client_logger = logging.getLogger('apify_client')
apify_client_logger.setLevel(logging.INFO)
apify_client_logger.addHandler(handler)

# The SDK logger is set to DEBUG level.
apify_logger = logging.getLogger('apify')
apify_logger.setLevel(logging.DEBUG)
apify_logger.addHandler(handler)

# Execute the Actor main coroutine
asyncio.run(main())

src/main.py

from urllib.parse import urlparse

from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from apify import Actor

async def main() -> None:
    """Scrape the repository name and star count of GitHub repository pages.

    The URLs are taken from the `start_urls` field of the Actor input (a list of
    objects carrying a `url` key). When the input is missing or yields no URL,
    the example repository URL is used instead.

    For every page that is scraped successfully, one item with the keys
    `repo_url`, `repo_name` and `stars_count` is pushed to the dataset.
    A failure on one URL is logged and does not stop the remaining URLs.
    """
    default_repo_url = 'https://github.com/apify/crawlee'  # Example repository URL

    async with Actor() as actor:
        # Collect the URLs from the Actor input, ignoring entries without a `url`.
        actor_input = await actor.get_input() or {}
        repo_urls = [
            source['url']
            for source in (actor_input.get('start_urls') or [])
            if isinstance(source, dict) and source.get('url')
        ]
        if not repo_urls:
            repo_urls = [default_repo_url]

        chrome_options = ChromeOptions()
        if actor.config.headless:
            chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome(options=chrome_options)

        try:
            wait = WebDriverWait(driver, 10)

            for repo_url in repo_urls:
                # Path part of the URL, e.g. '/apify/crawlee'; used to find the
                # link that points back to the repository itself.
                repo_path = urlparse(repo_url).path.rstrip('/')

                try:
                    driver.get(repo_url)

                    # `wait.until()` raises on timeout instead of returning a falsy
                    # value, so no "not found" fallback is needed here.
                    repo_name_element = wait.until(EC.presence_of_element_located(
                        (By.CSS_SELECTOR, f"strong a[href*='{repo_path}']")
                    ))
                    repo_name = repo_name_element.text.strip()

                    stars_element = wait.until(EC.presence_of_element_located(
                        (By.CSS_SELECTOR, "a.Link--muted[href*='/stargazers'] strong")
                    ))
                    stars_count = stars_element.text.strip()

                    actor.log.info(f"Repository: {repo_name}, Stars: {stars_count}")
                    await actor.push_data({
                        'repo_url': repo_url,
                        'repo_name': repo_name,
                        'stars_count': stars_count
                    })

                except Exception as e:
                    actor.log.exception(f'Cannot extract data from {repo_url}. Exception: {e}')
        finally:
            # Always shut the browser down, even when scraping failed.
            driver.quit()

.dockerignore

# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
.venv

# git folder
.git

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.gitignore

# This file tells Git which files shouldn't be added to source control

.idea
.DS_Store

apify_storage
storage

.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg

__pycache__

.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache
.ruff_cache

.scrapy
*.log

requirements.txt

# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify ~= 1.5.1
selenium ~= 4.14.0

Github Profile Scraper

saswave/github-profile-scraper

GitHub User Profile Scraper. Extracts data from GitHub profiles, including followers, following, LinkedIn, Twitter, achievements and much more. Ideal for developers, researchers, and marketers. Works from a list of GitHub profiles or a repository's stargazers link.

SASWAVE

107

GitHub Repository Scraper

vulnv/github-repository-scraper

Scrape and extract GitHub repository data, metadata, statistics, stars, forks, issues, and project information from multiple repositories at once.

VulnV

5.0

Github Users Scraper

dtrungtin/github-users-scraper

Github Users Scraper is an Apify actor for extracting users or emails from Github. It allows you to extract all watchers, stargazers, and members from a repository page.

Tin

233

4.0

Github User Profile Scraper

powerful_bachelor/Github-User-Profile-Scraper

The GitHub User Profile Scraper extracts vital info from GitHub profiles, including followers, following, LinkedIn, Twitter, achievements and much more. Ideal for developers, researchers, and marketers, it supports multiple profiles and exports data in various formats.

Powerful Bachelor

Github Search Scraper

saswave/github-search-scraper

Github search scraper. Get all data from search results list

SASWAVE

Github Profile Scraper

vulnv/github-profile-scraper

Scrapes GitHub user profiles including bio, repositories, followers, contributions, and more. Accepts a list of usernames and extracts comprehensive profile data.

VulnV

5.0

GitHub Repository Scraper

fresh_cliff/github-scraper

This actor scrapes detailed information from GitHub repositories using reliable HTTP requests and HTML parsing. It extracts repository metadata including star counts, fork counts, topics/tags, license information, primary programming language, and last updated timestamps.

Brennan Crawford

Github Repo User Scraper

inquisitive_sarangi/github-repo-scraper

Github Repo User Scraper is a simple tool to extract the users of one or more repos, such as contributors, stargazers and watchers. You can also export the listings to JSON, CSV or any other format.

API Master

Github Repo Markdown Scraper

louisdeconinck/github-repo-markdown-scraper

Transform GitHub repositories into a single, comprehensive markdown document effortlessly. Our tool streamlines analysis and processing, offering configurable file size limits, pattern filtering, and batch processing. Perfect for LLM AI prompts, it handles large repositories with ease.

Louis Deconinck

5.0

Github Champion

mihails/github-champion

Find your organization's top contributors. In every time period, performance is measured by the number of assigned issues closed, number of PR reviews, and number of PRs opened.