Tiktok ads scraper
View all Actors
This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?
See alternative Actors

Tiktok ads scraper
liquid_hardware/tiktok-ads-scraper
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
# This image ships Python 3.11 plus Chrome/chromedriver for Selenium.
FROM apify/actor-python-selenium:3.11

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build (Docker layer caching skips the install
# when requirements.txt is unchanged).
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "TiktokAdsScrapper",
4 "title": "TikTok Ads Scraper with Selenium",
5 "description": "Scrapes TikTok ads information and displays the data.",
6 "version": "0.0",
7 "meta": {
8 "templateId": "python-selenium"
9 },
10 "input": "./input_schema.json",
11 "dockerfile": "./Dockerfile",
12 "storages": {
13 "dataset": {
14 "actorSpecification": 1,
15 "fields" : {},
16 "views": {
17 "tiktok-ads-data": {
18 "title": "tiktok-ads-data",
19 "transformation": {
20 "fields": [
21 "ad_id",
22 "ad_advertiser",
23 "first_shown",
24 "last_shown",
25 "unique_user_views",
26 "target_audience",
27 "country_list",
28 "gender",
29 "age",
30 "additional_parameters",
31 "video_link"
32 ]
33 },
34 "display": {
35 "component": "table",
36 "properties": {
37 "ad_id": {
38 "label": "Ad ID",
39 "format": "text"
40 },
41 "ad_advertiser": {
42 "label": "Advertiser",
43 "format": "text"
44 },
45 "first_shown": {
46 "label": "First Shown",
47 "format": "text"
48 },
49 "last_shown": {
50 "label": "Last Shown",
51 "format": "text"
52 },
53 "unique_user_views": {
54 "label": "Unique User Views",
55 "format": "text"
56 },
57 "target_audience": {
58 "label": "Target Audience",
59 "format": "text"
60 },
61 "country_list": {
62 "label": "Country List",
63 "format": "text"
64 },
65 "gender": {
66 "label": "Gender Distribution",
67 "format": "text"
68 },
69 "age": {
70 "label": "Age Distribution",
71 "format": "text"
72 },
73 "additional_parameters": {
74 "label": "Additional Parameters",
75 "format": "text"
76 },
77 "video_link": {
78 "label": "Video Link",
79 "format": "link"
80 }
81 }
82 }
83 }
84 }
85 }
86 }
87}
.actor/input_schema.json
1{
2 "title": "Python Selenium Scraper",
3 "type": "object",
4 "schemaVersion": 1,
5 "properties": {
6 "start_urls": {
7 "title": "Start URLs",
8 "type": "array",
9 "description": "URLs to start with",
10 "prefill": [
11 { "url": "https://apify.com" }
12 ],
13 "editor": "requestListSources"
14 },
15 "max_depth": {
16 "title": "Maximum depth",
17 "type": "integer",
18 "description": "Depth to which to scrape to",
19 "default": 1
20 }
21 },
22 "required": ["start_urls"]
23}
src/__main__.py
1"""
2This module serves as the entry point for executing the Apify Actor. It handles the configuration of logging
3settings. The `main()` coroutine is then executed using `asyncio.run()`.
4
5Feel free to modify this file to suit your specific needs.
6"""
7
8import asyncio
9import logging
10
11from apify.log import ActorLogFormatter
12
13from .main import main
14
15# Configure loggers
16handler = logging.StreamHandler()
17handler.setFormatter(ActorLogFormatter())
18
19apify_client_logger = logging.getLogger('apify_client')
20apify_client_logger.setLevel(logging.INFO)
21apify_client_logger.addHandler(handler)
22
23apify_logger = logging.getLogger('apify')
24apify_logger.setLevel(logging.DEBUG)
25apify_logger.addHandler(handler)
26
27# Execute the Actor main coroutine
28asyncio.run(main())
src/main.py
import asyncio
import csv
import logging
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from apify import Actor
12
13
# SVG path drawn for an *empty* bar in the TikTok Ads Library targeting tables;
# when a cell contains this path, the corresponding segment is NOT targeted.
_EMPTY_BAR_PATH = "M6 23a1 1 0 0 1 1-1h34a1 1 0 0 1 1 1v2a1 1 0 0 1-1 1H7a1 1 0 0 1-1-1v-2Z"

# Age buckets in the order their columns appear in the age-targeting table.
_AGE_BUCKETS = ('13-17', '18-24', '25-34', '35-44', '45-54', '55+')


def _chrome_options():
    """Build headless ChromeOptions suitable for running inside a container."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    return options


def _bar_is_filled(cell):
    """Return True when a table cell's bar marks the segment as targeted.

    Guards against a missing <path> element (the original indexed
    ``cell.find('path')['d']`` unconditionally, raising TypeError when
    no path was present).
    """
    path = cell.find('path')
    return not (path is not None and path.get('d') == _EMPTY_BAR_PATH)


def _text_or(tag, default):
    """Return the tag's text, or *default* when the tag was not found."""
    return default if tag is None else tag.text


def _collect_ad_ids(start_url, actor):
    """Open the ads listing, click "View More" until exhausted, return ad ids.

    The WebDriver is always quit, even if an unexpected exception occurs
    (the original leaked the browser in that case).
    """
    driver = webdriver.Chrome(_chrome_options())
    try:
        driver.get(start_url)  # original called this twice; once is enough
        wait = WebDriverWait(driver, 10)
        try:
            while True:
                # Click the "View More" button until it stops appearing.
                button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".loading_more")))
                driver.execute_script("arguments[0].scrollIntoView();", button)
                driver.execute_script("window.scrollBy(0, -100);")
                driver.execute_script("arguments[0].click();", button)
                # Optional: wait briefly for new ad cards to load.
                WebDriverWait(driver, 2).until(lambda d: d.find_element(By.CSS_SELECTOR, ".ad_card"))
        except (TimeoutException, NoSuchElementException):
            actor.log.info("All content loaded or button not found.")
        html = driver.page_source
    finally:
        driver.quit()

    soup = BeautifulSoup(html, 'html.parser')
    # Each ad card links to its detail page; the ad id is the last query value.
    return [link['href'].split('=')[-1] for link in soup.find_all('a', class_='link')]


def _parse_gender_table(tbody):
    """Parse the per-country gender table; return (country names, entries)."""
    countries, entries = [], []
    for row in tbody.find_all('tr'):
        cells = row.find_all('td')
        country = cells[1].text.strip()
        countries.append(country)
        entries.append({
            'country': country,
            'gender': {
                'Male': _bar_is_filled(cells[2]),
                'Female': _bar_is_filled(cells[3]),
                'Unknown': _bar_is_filled(cells[4]),
            },
        })
    return countries, entries


def _parse_age_table(tbody):
    """Parse the per-country age-bucket table into a list of entries."""
    entries = []
    for row in tbody.find_all('tr'):
        cells = row.find_all('td')
        entries.append({
            'country': cells[1].text.strip(),
            'ages': {bucket: _bar_is_filled(cells[2 + i])
                     for i, bucket in enumerate(_AGE_BUCKETS)},
        })
    return entries


def _parse_additional_parameters(soup):
    """Collect the "additional parameters" table into a single-dict list.

    Mirrors the original output shape: a list containing one dict (empty
    when the table has no rows).
    """
    entry = {}
    for row in soup.find_all('tr', class_="targeting_additional_parameters_table_row"):
        param = row.find('td', class_="targeting_additional_parameters_table_first_col")
        if param is None:
            continue  # malformed row; original would have crashed on .text
        status = row.find('td', class_='')
        entry[param.text] = status.text if status is not None else 'None'
    return [entry]


def _scrape_ad_detail(ad_id, actor):
    """Fetch one ad's detail page and parse it into a flat record dict."""
    ad_url = 'https://library.tiktok.com/ads/detail/?ad_id=' + ad_id
    driver = webdriver.Chrome(_chrome_options())
    try:
        driver.get(ad_url)
        time.sleep(2)  # give the single-page app time to render
        ad_html = driver.page_source
    finally:
        # Always release the browser (original leaked it on any exception).
        driver.quit()

    ad_soup = BeautifulSoup(ad_html, 'html.parser')

    # first_shown / last_shown / unique_user_views appear in this order;
    # pad defensively so a sparse page cannot raise IndexError.
    details = [tag.text for tag in ad_soup.find_all('span', {'class': 'item_value'})]
    while len(details) < 3:
        details.append('')

    advertiser = _text_or(ad_soup.find('div', {'class': 'ad_advertiser_value'}),
                          f"no advertiser available for ad_id: {ad_id}")
    video_tag = ad_soup.find('video')
    video_link = video_tag['src'] if video_tag is not None else f"no video available for ad_id: {ad_id}"
    target_audience = _text_or(ad_soup.find('span', {'class': 'ad_target_audience_size_value'}),
                               f"no views available for ad_id: {ad_id}")

    # rows[0] = gender table, rows[1] = age table; guard against either
    # being absent (original indexed both unconditionally).
    rows = ad_soup.find_all('tbody', class_='byted-Table-Body')
    country_list, gender = _parse_gender_table(rows[0]) if rows else ([], [])
    age = _parse_age_table(rows[1]) if len(rows) > 1 else []

    return {
        'ad_id': ad_id,
        'ad_advertiser': advertiser,
        'first_shown': details[0],
        'last_shown': details[1],
        'unique_user_views': details[2],
        'target_audience': target_audience,
        'country_list': country_list,
        'gender': gender,
        'age': age,
        'additional_parameters': _parse_additional_parameters(ad_soup),
        'video_link': video_link,
    }


async def main():
    """Actor entry point: scrape the TikTok Ads Library for each start URL.

    Reads ``start_urls`` (and the currently unused ``max_depth``) from the
    Actor input, collects every ad id from each listing page, scrapes each
    ad's detail page into the ``tiktok-ads-data`` dataset, then exports the
    dataset as CSV into the ``my-key-value-store`` key-value store.
    """
    async with Actor() as actor:
        input_data = await actor.get_input() or {}
        start_urls = input_data.get('start_urls', [])
        # Part of the template's input schema; the listing is flat so depth
        # is currently unused — kept for input-schema compatibility.
        max_depth = input_data.get('max_depth', 1)

        dataset = await actor.open_dataset(name='tiktok-ads-data')

        ad_ids = []  # initialized so the final log line is safe with no URLs
        for url_obj in start_urls:
            start_url = url_obj.get('url')
            if not start_url:
                continue

            ad_ids = _collect_ad_ids(start_url, actor)
            for ad_id in ad_ids:
                await dataset.push_data(_scrape_ad_detail(ad_id, actor))

        # Export the data as CSV into the key-value store.
        await dataset.export_to_csv('data.csv', to_key_value_store_name='my-key-value-store')

        # Print the exported records for run-log visibility.
        store = await actor.open_key_value_store(name='my-key-value-store')
        print(await store.get_value('data.csv'))
        actor.log.info(f"Ad IDs: {ad_ids}")
182
183
def download_video(url, ad_id, headers=None):
    """Download an ad video to ``<ad_id>.mp4`` in the working directory.

    Args:
        url: Direct video URL (the detail page's ``<video src>``).
        ad_id: Ad identifier; used to name the output file.
        headers: Optional HTTP headers. Defaults to a browser-like
            User-Agent. (The original read an undefined global ``headers``
            and logged via an undefined ``actor`` — both NameErrors.)
    """
    if headers is None:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }
    logger = logging.getLogger(__name__)
    response = requests.get(url, headers=headers, stream=True)
    if response.status_code == 200:
        filename = f"{ad_id}.mp4"
        # Stream to disk in 1 KiB chunks so large videos are not held in memory.
        with open(filename, "wb") as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
        # Original logged the literal "(unknown) downloaded successfully."
        logger.info(f"{filename} downloaded successfully.")
    else:
        logger.info(f"Failed to download video from {url}. Status code: {response.status_code}")
195
196
197if __name__ == "__main__":
198 asyncio.run(main())
199
200 # Example usage:
201 # url = 'https://library.tiktok.com/api/v1/cdn/1712151504/video/aHR0cHM6Ly92NzcudGlrdG9rY2RuLmNvbS8zZjJiOWU5YmNhOGRlMGJjZjA3YmIwYWRiN2E3ZjE4Yi82NjBkYjAzOS92aWRlby90b3MvdXNlYXN0MmEvdG9zLXVzZWFzdDJhLXZlLTAwNjgtZXV0dHAvb01JRUtGbWVzQkM2RFVEOUVmeFJRUFFRRVd3aWdDaWRzNEpBUnQv/7cd4caab-916d-44c8-b3b9-34c155a810e1?a=475769&bti=PDU2NmYwMy86&ch=0&cr=0&dr=1&cd=0%7C0%7C0%7C0&cv=1&br=3876&bt=1938&cs=0&ds=6&ft=.NpOcInz7Th4FkErXq8Zmo&mime_type=video_mp4&qs=0&rc=aGg7ZmY3NTdnNDxnOzNoZ0BpM3RsO2s5cnFmcjMzZjczM0BjLmJiLTUxXjQxNDUxXy9hYSNkbWZjMmRjMTJgLS1kMWNzcw%3D%3D&vvpl=1&l=20240403133823D8EDE6F40EAE406831A7&btag=e00088000&cc=13'
202 # ad_id = 123
203 # download_video(url, ad_id)
.dockerignore
1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10.venv
11
12# git folder
13.git
.editorconfig
1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf
.gitignore
1# This file tells Git which files shouldn't be added to source control
2
3.idea
4.DS_Store
5
6apify_storage
7storage
8
9.venv/
10.env/
11__pypackages__
12dist/
13build/
14*.egg-info/
15*.egg
16
17__pycache__
18
19.mypy_cache
20.dmypy.json
21dmypy.json
22.pytest_cache
23.ruff_cache
24
25.scrapy
26*.log
requirements.txt
1beautifulsoup4
2pandas
3selenium
4requests
Developer
Maintained by Community
Categories