req

Deprecated

Developed by forbit
Maintained by Community
Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 3
Monthly users: 3
Last modified: 2 years ago

.actor/Dockerfile

# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
#FROM apify/actor-python:3.11
FROM apify/actor-python-playwright:3.11
# Second, copy just requirements.txt into the actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./
# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
&& python --version \
&& echo "Pip version:" \
&& pip --version \
&& echo "Installing dependencies:" \
&& pip install -r requirements.txt \
&& echo "All installed Python packages:" \
&& pip freeze \
&& playwright install chromium
#&& playwright install-deps
# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./
# Specify how to launch the source code of your actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "my-actor-1",
    "title": "Scrape single page in Python",
    "description": "Scrape data from single page with provided URL.",
    "version": "0.0",
    "meta": {
        "templateId": "python-start"
    },
    // "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "Scrape data from a web page",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "url": {
            "title": "URL of the page",
            "type": "string",
            "description": "The URL of the website you want to get the data from.",
            "editor": "textfield",
            "prefill": "https://www.apify.com/"
        }
    },
    "required": ["url"]
}

src/__init__.py


src/__main__.py

import asyncio
import logging

from apify.log import ActorLogFormatter

from .main import main

# Set up logging of messages from the Apify SDK
handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter())

apify_client_logger = logging.getLogger('apify_client')
apify_client_logger.setLevel(logging.INFO)
apify_client_logger.addHandler(handler)

apify_logger = logging.getLogger('apify')
apify_logger.setLevel(logging.DEBUG)
apify_logger.addHandler(handler)

asyncio.run(main())

src/main.py

1# Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/python).
2# Requests - library for making HTTP requests in Python (Read more at https://requests.readthedocs.io)
3# Beautiful Soup - library for pulling data out of HTML and XML files (Read more at https://www.crummy.com/software/BeautifulSoup/bs4/doc)
4# from bs4 import BeautifulSoup
5import requests, asyncio
6from apify import Actor
7import playwright
8from playwright.async_api import async_playwright
9print('................reqbot...............0')
10
11url_adsite = 'https://a000.ex16.repl.co/'
12url_ip = 'https://whatsmyip.com/'
13url_aclick = 'https://click.a-ads.com/2199641/2/'
14url_aipm = 'https://ad.a-ads.com/2199641?size=728x90'
15url_2bt = 'http://traffic2bitcoin.com/ptp2.php?ref=Exash'
16# url_adsterra = 'https://www.highcpmrevenuegate.com/dahcgmdy89?key=cbcfa1e9a4c631faf6ed1b29519abfce'
17url_adsterra = 'https://www.toprevenuegate.com/dahcgmdy89?key=cbcfa1e9a4c631faf6ed1b29519abfce'
18
19async def main():
20 async with Actor:
21 # from playwright.async_api import async_playwright
22 p = await async_playwright().start()
23 BLOCK_RESOURCE_TYPES = [
24 'beacon',
25 'csp_report',
26 'font',
27 'image',
28 'imageset',
29 'media',
30 'object',
31 'texttrack',
32 # we can even block stylsheets and scripts though it's not recommended:
33 # 'stylesheet',
34 # 'script',
35 # 'xhr',
36 ]
37
38
39 # we can also block popular 3rd party resources like tracking and advertisements.
40 BLOCK_RESOURCE_NAMES = [
41 'adzerk',
42 'analytics',
43 'cdn.api.twitter',
44 'doubleclick',
45 'exelator',
46 'facebook',
47 'fontawesome',
48 'google',
49 'google-analytics',
50 'googletagmanager',
51 ]
52
53 def intercept_route(route):
54 """intercept all requests and abort blocked ones"""
55 if route.request.resource_type in BLOCK_RESOURCE_TYPES:
56 print(f'blocking background resource {route.request} blocked type "{route.request.resource_type}"')
57 return route.abort()
58 if any(key in route.request.url for key in BLOCK_RESOURCE_NAMES):
59 print(f"blocking background resource {route.request} blocked name {route.request.url}")
60 return route.abort()
61 return route.continue_()
62
63 print('................reqbot...............')
64 # browser = p.chromium.launch(headless=True, ,proxy={'server':proxy_url})
65 # page = browser.new_page()
66 # page.goto(website)
67 # async with async_playwright() as p:
68 # input = await Actor.get_input()
69 # proxy_configuration = await Actor.create_proxy_configuration()
70 # proxy_configuration = await Actor.create_proxy_configuration(groups=['RESIDENTIAL'])
71 for i in range(1000):
72 proxy_configuration = await Actor.create_proxy_configuration(groups=[f'RESIDENTIAL'], country_code = 'FR')
73 proxy_url = await proxy_configuration.new_url(str(i))
74 print(proxy_url)
75 # proxy_url = f'http://groups-RESIDENTIAL:apify_proxy_kLL2nn1MA5Wd468LmDvPExVB8Np06o0fryYf@proxy.apify.com:8000'
76 proxies = {
77 'http': proxy_url,
78 'https': proxy_url,
79 }
80 # response = requests.get(input['url'])
81 session = requests.Session()
82 session.proxies = proxies
83
84 # browser = await p.chromium.launch(headless=True, proxy={'server':proxy_url}, args=["--disable-features=Data-saving --user-agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'"])
85 # browser = await p.chromium.launch(headless=True, args=["--disable-features=Data-saving --user-agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'"])
86 # for i in range(3):
87 try:
88 # def tes():
89 print(i, proxy_url)
90 r = session.get(url_2bt,verify=False)#, proxies=proxies)
91 # r = session.get(url_adsterra,verify=False)#, proxies=proxies)
92 # r = session.get(url_ip,verify=False)#, proxies=proxies)
93 print('statbt: ',r.status_code)
94 # proxy_url ={'server':'proxy.apify.com:8000', "username": f'groups-RESIDENTIAL,session-new_job_{i}', "password":"apify_proxy_kLL2nn1MA5Wd468LmDvPExVB8Np06o0fryYf"}
95 # proxy_url ={'server':'proxy.apify.com:8000', "username": f'groups-RESIDENTIAL', "password":"apify_proxy_kLL2nn1MA5Wd468LmDvPExVB8Np06o0fryYf"}
96 # proxy_url = f'http://groups-RESIDENTIAL,session-new_job_{i}:apify_proxy_kLL2nn1MA5Wd468LmDvPExVB8Np06o0fryYf@proxy.apify.com:8000'
97 # proxy_url = {'server':proxy_url}
98 proxy_url = {'server':proxy_url}
99 # r = session.get(url_aipm,verify=False)#, proxies=proxies)
100 # print('edn: ', r.text)
101 # r = session.get(url_aipm,verify=False)#, proxies=proxies)
102 # print('edn: ', r.text)
103
104 browser = await p.chromium.launch(headless=True,args=["--disable-features=Data-saving --user-agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'"])
105 contex = await browser.new_context(ignore_https_errors=True,proxy=proxy_url)
106 # contex = await browser.new_context(ignore_https_errors=True)
107 # context.set_cache_enabled(False)
108 page = await contex.new_page()
109 # page.authenticate()
110 # page = await browser.new_page()
111 # await page.goto(url_adsite)
112 await page.route("**/*", intercept_route)
113 await page.goto(url_adsite, timeout=20000)
114 print('urled')
115 r = session.get(url_adsterra,verify=False)
116 # con = await page.content()
117
118 # print('con: ', con)
119 #session.get(url_aipm, verify=False)#, proxies=proxies)
120 #await page.reload()
121 # session.get(url_adsterra,verify=False)#, proxies=proxies)
122 # print(response.text)
123 print(i, proxy_url)
124 # tes()
125 except Exception as e:
126 print(e)
127 print('done!!!')
128 print('docmer')
129
130if __name__ == '__main__':
131 asyncio.run(main())
132 print('docn')

.dockerignore

# configurations
.idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
.venv
# git folder
.git

.editorconfig

root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.gitignore

# This file tells Git which files shouldn't be added to source control
.idea
.DS_Store
apify_storage
storage/*
!storage/key_value_stores
storage/key_value_stores/*
!storage/key_value_stores/default
storage/key_value_stores/default/*
!storage/key_value_stores/default/INPUT.json
.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg
__pycache__
.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache
.scrapy
*.log

requirements.txt

# Add your dependencies here.
# See https://pip.pypa.io/en/latest/reference/requirements-file-format/
# for how to format them
apify ~= 1.1.1
beautifulsoup4 ~= 4.12.0
requests ~= 2.31.0
playwright