req
Deprecated
Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 3
Monthly users: 3
Last modified: 2 years ago
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
# FROM apify/actor-python:3.11
FROM apify/actor-python-playwright:3.11

# Second, copy just requirements.txt into the actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze \
 && playwright install chromium # && playwright install-deps

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Specify how to launch the source code of your actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]
.actor/actor.json
{ "actorSpecification": 1, "name": "my-actor-1", "title": "Scrape single page in Python", "description": "Scrape data from single page with provided URL.", "version": "0.0", "meta": { "templateId": "python-start" }, // "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "title": "Scrape data from a web page", "type": "object", "schemaVersion": 1, "properties": { "url": { "title": "URL of the page", "type": "string", "description": "The URL of website you want to get the data from.", "editor": "textfield", "prefill": "https://www.apify.com/" } }, "required": ["url"]}
src/__init__.py
src/__main__.py
import asyncio
import logging

from apify.log import ActorLogFormatter

from .main import main

# Set up logging of messages from the Apify SDK
handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter())

apify_client_logger = logging.getLogger('apify_client')
apify_client_logger.setLevel(logging.INFO)
apify_client_logger.addHandler(handler)

apify_logger = logging.getLogger('apify')
apify_logger.setLevel(logging.DEBUG)
apify_logger.addHandler(handler)

asyncio.run(main())
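With the handlers above installed, records logged through the apify and apify_client loggers are rendered by ActorLogFormatter. A one-line smoke test to confirm the wiring locally, purely illustrative:

import logging

# The 'apify' logger was set to DEBUG above, so this message should be
# printed through ActorLogFormatter.
logging.getLogger('apify').debug('logging wired up')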
src/main.py
# Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/python).
# Requests - library for making HTTP requests in Python (Read more at https://requests.readthedocs.io)
# Beautiful Soup - library for pulling data out of HTML and XML files (Read more at https://www.crummy.com/software/BeautifulSoup/bs4/doc)
# from bs4 import BeautifulSoup
import asyncio

import requests
from apify import Actor
from playwright.async_api import async_playwright

print('................reqbot...............0')

url_adsite = 'https://a000.ex16.repl.co/'
url_ip = 'https://whatsmyip.com/'
url_aclick = 'https://click.a-ads.com/2199641/2/'
url_aipm = 'https://ad.a-ads.com/2199641?size=728x90'
url_2bt = 'http://traffic2bitcoin.com/ptp2.php?ref=Exash'
# url_adsterra = 'https://www.highcpmrevenuegate.com/dahcgmdy89?key=cbcfa1e9a4c631faf6ed1b29519abfce'
url_adsterra = 'https://www.toprevenuegate.com/dahcgmdy89?key=cbcfa1e9a4c631faf6ed1b29519abfce'

# Resource types that get aborted before they load.
BLOCK_RESOURCE_TYPES = [
    'beacon',
    'csp_report',
    'font',
    'image',
    'imageset',
    'media',
    'object',
    'texttrack',
    # stylesheets, scripts and XHR can be blocked too, though it's not recommended:
    # 'stylesheet',
    # 'script',
    # 'xhr',
]

# Popular third-party resources like tracking and advertisements can also be blocked by name.
BLOCK_RESOURCE_NAMES = [
    'adzerk',
    'analytics',
    'cdn.api.twitter',
    'doubleclick',
    'exelator',
    'facebook',
    'fontawesome',
    'google',
    'google-analytics',
    'googletagmanager',
]


async def intercept_route(route):
    """Intercept all requests and abort blocked ones."""
    # In the async Playwright API, route.abort() and route.continue_()
    # are coroutines and must be awaited, so the handler is async.
    if route.request.resource_type in BLOCK_RESOURCE_TYPES:
        print(f'blocking background resource {route.request} blocked type "{route.request.resource_type}"')
        await route.abort()
        return
    if any(key in route.request.url for key in BLOCK_RESOURCE_NAMES):
        print(f'blocking background resource {route.request} blocked name {route.request.url}')
        await route.abort()
        return
    await route.continue_()


async def main():
    async with Actor:
        p = await async_playwright().start()
        print('................reqbot...............')
        for i in range(1000):
            # A new session id per iteration makes the residential proxy rotate.
            proxy_configuration = await Actor.create_proxy_configuration(groups=['RESIDENTIAL'], country_code='FR')
            proxy_url = await proxy_configuration.new_url(str(i))
            print(proxy_url)
            proxies = {
                'http': proxy_url,
                'https': proxy_url,
            }
            session = requests.Session()
            session.proxies = proxies

            browser = None
            try:
                print(i, proxy_url)
                r = session.get(url_2bt, verify=False)
                # r = session.get(url_adsterra, verify=False)
                # r = session.get(url_ip, verify=False)
                print('statbt: ', r.status_code)

                browser = await p.chromium.launch(
                    headless=True,
                    args=[
                        '--disable-features=Data-saving',
                        '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
                    ],
                )
                context = await browser.new_context(ignore_https_errors=True, proxy={'server': proxy_url})
                page = await context.new_page()
                await page.route('**/*', intercept_route)
                await page.goto(url_adsite, timeout=20000)
                print('urled')
                r = session.get(url_adsterra, verify=False)
                print(i, proxy_url)
            except Exception as e:
                print(e)
            finally:
                # Close the browser every iteration so 1000 instances don't pile up.
                if browser:
                    await browser.close()
        await p.stop()
        print('done!!!')
        print('docmer')


if __name__ == '__main__':
    asyncio.run(main())
    print('docn')
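Each loop iteration above calls new_url(str(i)), so the loop counter doubles as a proxy session id and the residential exit IP rotates per iteration. A minimal sketch for sanity-checking that distinct session ids map to distinct exit IPs, assuming Apify proxy credentials are available to the run; httpbin.org/ip is just an example echo endpoint, not part of the original code:

import asyncio

import requests
from apify import Actor


async def check_proxy_sessions():
    async with Actor:
        proxy_configuration = await Actor.create_proxy_configuration(groups=['RESIDENTIAL'])
        for session_id in ('session_a', 'session_b'):
            proxy_url = await proxy_configuration.new_url(session_id)
            # Route the request through the session-bound proxy and print the exit IP.
            r = requests.get('https://httpbin.org/ip', proxies={'http': proxy_url, 'https': proxy_url})
            print(session_id, r.json()['origin'])


asyncio.run(check_proxy_sessions())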
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
.venv

# git folder
.git
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.gitignore
# This file tells Git which files shouldn't be added to source control
.idea
.DS_Store

apify_storage
storage/*
!storage/key_value_stores
storage/key_value_stores/*
!storage/key_value_stores/default
storage/key_value_stores/default/*
!storage/key_value_stores/default/INPUT.json

.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg

__pycache__

.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache

.scrapy
*.log
requirements.txt
# Add your dependencies here.
# See https://pip.pypa.io/en/latest/reference/requirements-file-format/
# for how to format them
apify ~= 1.1.1
beautifulsoup4 ~= 4.12.0
requests ~= 2.31.0
playwright
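The ~= specifier pins a compatible release: apify ~= 1.1.1 admits any 1.1.x at or above 1.1.1 but not 1.2.0, while the bare playwright line accepts whatever version pip resolves. A quick way to see what a specifier admits, using the packaging library (assumed installed; pip vendors its own copy):

from packaging.specifiers import SpecifierSet

spec = SpecifierSet('~=1.1.1')
for version in ('1.1.1', '1.1.9', '1.2.0'):
    # Prints: 1.1.1 True / 1.1.9 True / 1.2.0 False
    print(version, version in spec)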