req
Deprecated
Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 3
Monthly users: 3
Last modified: 2 years ago
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
# FROM apify/actor-python:3.11
FROM apify/actor-python-playwright:3.11

# Second, copy just requirements.txt into the actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze \
 && playwright install chromium # && playwright install-deps

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Specify how to launch the source code of your actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]
.actor/actor.json
{ "actorSpecification": 1, "name": "my-actor-1", "title": "Scrape single page in Python", "description": "Scrape data from single page with provided URL.", "version": "0.0", "meta": { "templateId": "python-start" }, // "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "title": "Scrape data from a web page", "type": "object", "schemaVersion": 1, "properties": { "url": { "title": "URL of the page", "type": "string", "description": "The URL of website you want to get the data from.", "editor": "textfield", "prefill": "https://www.apify.com/" } }, "required": ["url"]}
src/__init__.py
src/__main__.py
import asyncio
import logging

from apify.log import ActorLogFormatter

from .main import main

# Set up logging of messages from the Apify SDK
handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter())

apify_client_logger = logging.getLogger('apify_client')
apify_client_logger.setLevel(logging.INFO)
apify_client_logger.addHandler(handler)

apify_logger = logging.getLogger('apify')
apify_logger.setLevel(logging.DEBUG)
apify_logger.addHandler(handler)

asyncio.run(main())
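With the handlers above installed, records logged through the apify and apify_client loggers are rendered by ActorLogFormatter. A one-line smoke test to confirm the wiring locally, purely illustrative:

import logging

# The 'apify' logger was set to DEBUG above, so this message should be
# printed through ActorLogFormatter.
logging.getLogger('apify').debug('logging wired up')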
src/main.py
# Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/python).
# Requests - library for making HTTP requests in Python (Read more at https://requests.readthedocs.io)
# Beautiful Soup - library for pulling data out of HTML and XML files (Read more at https://www.crummy.com/software/BeautifulSoup/bs4/doc)
# from bs4 import BeautifulSoup
import asyncio

import requests
from apify import Actor
from playwright.async_api import async_playwright

print('................reqbot...............0')

url_adsite = 'https://a000.ex16.repl.co/'
url_ip = 'https://whatsmyip.com/'
url_aclick = 'https://click.a-ads.com/2199641/2/'
url_aipm = 'https://ad.a-ads.com/2199641?size=728x90'
url_2bt = 'http://traffic2bitcoin.com/ptp2.php?ref=Exash'
# url_adsterra = 'https://www.highcpmrevenuegate.com/dahcgmdy89?key=cbcfa1e9a4c631faf6ed1b29519abfce'
url_adsterra = 'https://www.toprevenuegate.com/dahcgmdy89?key=cbcfa1e9a4c631faf6ed1b29519abfce'

# Resource types that get aborted before they load.
BLOCK_RESOURCE_TYPES = [
    'beacon',
    'csp_report',
    'font',
    'image',
    'imageset',
    'media',
    'object',
    'texttrack',
    # stylesheets, scripts and XHR can be blocked too, though it's not recommended:
    # 'stylesheet',
    # 'script',
    # 'xhr',
]

# Popular third-party resources like tracking and advertisements can also be blocked by name.
BLOCK_RESOURCE_NAMES = [
    'adzerk',
    'analytics',
    'cdn.api.twitter',
    'doubleclick',
    'exelator',
    'facebook',
    'fontawesome',
    'google',
    'google-analytics',
    'googletagmanager',
]


async def intercept_route(route):
    """Intercept all requests and abort blocked ones."""
    # In the async Playwright API, route.abort() and route.continue_()
    # are coroutines and must be awaited, so the handler is async.
    if route.request.resource_type in BLOCK_RESOURCE_TYPES:
        print(f'blocking background resource {route.request} blocked type "{route.request.resource_type}"')
        await route.abort()
        return
    if any(key in route.request.url for key in BLOCK_RESOURCE_NAMES):
        print(f'blocking background resource {route.request} blocked name {route.request.url}')
        await route.abort()
        return
    await route.continue_()


async def main():
    async with Actor:
        p = await async_playwright().start()
        print('................reqbot...............')
        for i in range(1000):
            # A new session id per iteration makes the residential proxy rotate.
            proxy_configuration = await Actor.create_proxy_configuration(groups=['RESIDENTIAL'], country_code='FR')
            proxy_url = await proxy_configuration.new_url(str(i))
            print(proxy_url)
            proxies = {
                'http': proxy_url,
                'https': proxy_url,
            }
            session = requests.Session()
            session.proxies = proxies

            browser = None
            try:
                print(i, proxy_url)
                r = session.get(url_2bt, verify=False)
                # r = session.get(url_adsterra, verify=False)
                # r = session.get(url_ip, verify=False)
                print('statbt: ', r.status_code)

                browser = await p.chromium.launch(
                    headless=True,
                    args=[
                        '--disable-features=Data-saving',
                        '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
                    ],
                )
                context = await browser.new_context(ignore_https_errors=True, proxy={'server': proxy_url})
                page = await context.new_page()
                await page.route('**/*', intercept_route)
                await page.goto(url_adsite, timeout=20000)
                print('urled')
                r = session.get(url_adsterra, verify=False)
                print(i, proxy_url)
            except Exception as e:
                print(e)
            finally:
                # Close the browser every iteration so 1000 instances don't pile up.
                if browser:
                    await browser.close()
        await p.stop()
        print('done!!!')
        print('docmer')


if __name__ == '__main__':
    asyncio.run(main())
    print('docn')
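Each loop iteration above calls new_url(str(i)), so the loop counter doubles as a proxy session id and the residential exit IP rotates per iteration. A minimal sketch for sanity-checking that distinct session ids map to distinct exit IPs, assuming Apify proxy credentials are available to the run; httpbin.org/ip is just an example echo endpoint, not part of the original code:

import asyncio

import requests
from apify import Actor


async def check_proxy_sessions():
    async with Actor:
        proxy_configuration = await Actor.create_proxy_configuration(groups=['RESIDENTIAL'])
        for session_id in ('session_a', 'session_b'):
            proxy_url = await proxy_configuration.new_url(session_id)
            # Route the request through the session-bound proxy and print the exit IP.
            r = requests.get('https://httpbin.org/ip', proxies={'http': proxy_url, 'https': proxy_url})
            print(session_id, r.json()['origin'])


asyncio.run(check_proxy_sessions())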
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
.venv

# git folder
.git
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.gitignore
# This file tells Git which files shouldn't be added to source control
.idea
.DS_Store

apify_storage
storage/*
!storage/key_value_stores
storage/key_value_stores/*
!storage/key_value_stores/default
storage/key_value_stores/default/*
!storage/key_value_stores/default/INPUT.json

.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg

__pycache__

.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache

.scrapy
*.log
requirements.txt
# Add your dependencies here.
# See https://pip.pypa.io/en/latest/reference/requirements-file-format/
# for how to format them
apify ~= 1.1.1
beautifulsoup4 ~= 4.12.0
requests ~= 2.31.0
playwright
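The ~= specifier pins a compatible release: apify ~= 1.1.1 admits any 1.1.x at or above 1.1.1 but not 1.2.0, while the bare playwright line accepts whatever version pip resolves. A quick way to see what a specifier admits, using the packaging library (assumed installed; pip vendors its own copy):

from packaging.specifiers import SpecifierSet

spec = SpecifierSet('~=1.1.1')
for version in ('1.1.1', '1.1.9', '1.2.0'):
    # Prints: 1.1.1 True / 1.1.9 True / 1.2.0 False
    print(version, version in spec)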