Tiktok ads scraper avatar
Tiktok ads scraper

Deprecated

Pricing

Pay per usage

Go to Store
Tiktok ads scraper

Tiktok ads scraper

Deprecated

Developed by

Tarun

Tarun

Maintained by Community

0.0 (0)

Pricing

Pay per usage

1

Total users

2

Monthly users

2

Last modified

a year ago

.actor/Dockerfile

# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python-selenium:3.11
# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
# (dependencies land in their own layer, so rebuilds after source-only changes reuse the cache)
COPY requirements.txt ./
# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
&& python --version \
&& echo "Pip version:" \
&& pip --version \
&& echo "Installing dependencies:" \
&& pip install -r requirements.txt \
&& echo "All installed Python packages:" \
&& pip freeze
# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./
# Use compileall to ensure the runnability of the Actor Python code.
# (fails the build early on syntax errors instead of at container start)
RUN python3 -m compileall -q .
# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]

.actor/actor.json

{
"actorSpecification": 1,
"name": "TiktokAdsScrapper",
"title": "TikTok Ads Scraper with Selenium",
"description": "Scrapes TikTok ads information and displays the data.",
"version": "0.0",
"meta": {
"templateId": "python-selenium"
},
"input": "./input_schema.json",
"dockerfile": "./Dockerfile",
"storages": {
"dataset": {
"actorSpecification": 1,
"fields" : {},
"views": {
"tiktok-ads-data": {
"title": "tiktok-ads-data",
"transformation": {
"fields": [
"ad_id",
"ad_advertiser",
"first_shown",
"last_shown",
"unique_user_views",
"target_audience",
"country_list",
"gender",
"age",
"additional_parameters",
"video_link"
]
},
"display": {
"component": "table",
"properties": {
"ad_id": {
"label": "Ad ID",
"format": "text"
},
"ad_advertiser": {
"label": "Advertiser",
"format": "text"
},
"first_shown": {
"label": "First Shown",
"format": "text"
},
"last_shown": {
"label": "Last Shown",
"format": "text"
},
"unique_user_views": {
"label": "Unique User Views",
"format": "text"
},
"target_audience": {
"label": "Target Audience",
"format": "text"
},
"country_list": {
"label": "Country List",
"format": "text"
},
"gender": {
"label": "Gender Distribution",
"format": "text"
},
"age": {
"label": "Age Distribution",
"format": "text"
},
"additional_parameters": {
"label": "Additional Parameters",
"format": "text"
},
"video_link": {
"label": "Video Link",
"format": "link"
}
}
}
}
}
}
}
}

.actor/input_schema.json

{
"title": "TikTok Ads Scraper",
"type": "object",
"schemaVersion": 1,
"properties": {
"start_urls": {
"title": "Start URLs",
"type": "array",
"description": "URLs to start with",
"prefill": [
{ "url": "https://apify.com" }
],
"editor": "requestListSources"
},
"max_depth": {
"title": "Maximum depth",
"type": "integer",
"description": "Depth to which to scrape to",
"default": 1
}
},
"required": ["start_urls"]
}

src/__main__.py

"""
This module serves as the entry point for executing the Apify Actor. It handles the configuration of logging
settings. The `main()` coroutine is then executed using `asyncio.run()`.

Feel free to modify this file to suit your specific needs.
"""

import asyncio
import logging

from apify.log import ActorLogFormatter

from .main import main

# Wire both Apify loggers through one stream handler that uses the
# Actor-aware formatter, each at its own verbosity level.
_handler = logging.StreamHandler()
_handler.setFormatter(ActorLogFormatter())

for _name, _level in (('apify_client', logging.INFO), ('apify', logging.DEBUG)):
    _logger = logging.getLogger(_name)
    _logger.setLevel(_level)
    _logger.addHandler(_handler)

# Execute the Actor main coroutine
asyncio.run(main())

src/main.py

import asyncio
import csv
import logging
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

from apify import Actor
12
13
# SVG path TikTok renders for an "empty" bar in the gender/age tables; a cell
# containing this path means the segment is NOT targeted.
# NOTE(review): taken from the scraped markup — verify it is still current.
_EMPTY_BAR_PATH = "M6 23a1 1 0 0 1 1-1h34a1 1 0 0 1 1 1v2a1 1 0 0 1-1 1H7a1 1 0 0 1-1-1v-2Z"

# Age buckets in the order their columns appear in the age table.
_AGE_BUCKETS = ('13-17', '18-24', '25-34', '35-44', '45-54', '55+')


def _make_chrome_options():
    """Build headless Chrome options suitable for a containerized run."""
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    return chrome_options


def _bar_is_empty(cell):
    """Return True when a table cell renders the "empty bar" icon."""
    path = cell.find('path')
    return path is not None and path.get('d') == _EMPTY_BAR_PATH


def _collect_ad_ids(driver, start_url, actor):
    """Load the listing page, click "View More" until it disappears, and
    return the ad ids extracted from the fully expanded page."""
    driver.get(start_url)
    try:
        wait = WebDriverWait(driver, 10)
        while True:
            # Keep clicking "View More"; the TimeoutException / missing
            # element ends the loop once everything is loaded.
            view_more_button = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, ".loading_more")))
            driver.execute_script("arguments[0].scrollIntoView();", view_more_button)
            driver.execute_script("window.scrollBy(0, -100);")
            driver.execute_script("arguments[0].click();", view_more_button)
            # Wait briefly for more ad cards to render.
            WebDriverWait(driver, 2).until(
                lambda d: d.find_element(By.CSS_SELECTOR, ".ad_card"))
    except (TimeoutException, NoSuchElementException):
        actor.log.info("All content loaded or button not found.")

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # Each ad card links to its detail page; the ad id is the last query value.
    return [link['href'].split('=')[-1] for link in soup.find_all('a', class_='link')]


def _parse_ad_detail(ad_soup, ad_id):
    """Parse one ad-detail page into the flat record pushed to the dataset."""
    details = [tag.text for tag in ad_soup.find_all('span', {'class': 'item_value'})]
    # Expected order: first shown / last shown / unique user views.
    # Pad so a partially rendered page cannot raise IndexError.
    while len(details) < 3:
        details.append('')

    advertiser_tag = ad_soup.find('div', {'class': 'ad_advertiser_value'})
    advertiser = advertiser_tag.text if advertiser_tag else f"no advertiser available for ad_id: {ad_id}"

    video_tag = ad_soup.find('video')
    video_link = video_tag['src'] if video_tag else f"no video available for ad_id: {ad_id}"

    audience_tag = ad_soup.find('span', {'class': 'ad_target_audience_size_value'})
    target_audience = audience_tag.text if audience_tag else f"no views available for ad_id: {ad_id}"

    gender, age, country_list = [], [], []
    rows = ad_soup.find_all('tbody', class_='byted-Table-Body')

    # First table body: per-country gender targeting.
    if len(rows) > 0:
        for row in rows[0].find_all('tr'):
            cells = row.find_all('td')
            country = cells[1].text.strip()
            country_list.append(country)
            gender.append({
                'country': country,
                'gender': {
                    'Male': not _bar_is_empty(cells[2]),
                    'Female': not _bar_is_empty(cells[3]),
                    'Unknown': not _bar_is_empty(cells[4]),
                },
            })

    # Second table body: per-country age-bucket targeting.
    if len(rows) > 1:
        for row in rows[1].find_all('tr'):
            cells = row.find_all('td')
            age.append({
                'country': cells[1].text.strip(),
                'ages': {
                    bucket: not _bar_is_empty(cells[2 + i])
                    for i, bucket in enumerate(_AGE_BUCKETS)
                },
            })

    # "Additional parameters" table: parameter name -> status text.
    entry = {}
    for p in ad_soup.find_all('tr', class_="targeting_additional_parameters_table_row"):
        param = p.find('td', class_="targeting_additional_parameters_table_first_col")
        if param is None:
            continue
        status = p.find('td', class_='')
        entry[param.text] = status.text if status is not None else 'None'

    return {
        'ad_id': ad_id,
        'ad_advertiser': advertiser,
        'first_shown': details[0],
        'last_shown': details[1],
        'unique_user_views': details[2],
        'target_audience': target_audience,
        'country_list': country_list,
        'gender': gender,
        'age': age,
        'additional_parameters': [entry],
        'video_link': video_link,
    }


async def main():
    """Actor entry point.

    For each start URL: expand the TikTok ad-library listing page, harvest all
    ad ids, scrape every ad's detail page, push the parsed records to the
    'tiktok-ads-data' dataset, and export them as CSV to a key-value store.
    """
    async with Actor() as actor:
        input_data = await actor.get_input() or {}
        start_urls = input_data.get('start_urls', [])

        dataset = await actor.open_dataset(name='tiktok-ads-data')

        for url_obj in start_urls:
            start_url = url_obj.get('url')
            if not start_url:
                continue

            chrome_options = _make_chrome_options()

            # Phase 1: expand the listing page and harvest the ad ids.
            driver = webdriver.Chrome(chrome_options)
            try:
                ad_ids = _collect_ad_ids(driver, start_url, actor)
            finally:
                driver.quit()  # always release the browser, even on errors

            # Phase 2: visit each ad's detail page and parse it.
            base_url = 'https://library.tiktok.com/ads/detail/?ad_id='
            for ad_id in ad_ids:
                driver = webdriver.Chrome(chrome_options)
                try:
                    driver.get(base_url + ad_id)
                    time.sleep(2)  # let the single-page app render the detail view
                    ad_soup = BeautifulSoup(driver.page_source, 'html.parser')
                finally:
                    driver.quit()
                await dataset.push_data(_parse_ad_detail(ad_soup, ad_id))

            # Export the data as CSV and print the exported records.
            await dataset.export_to_csv('data.csv', to_key_value_store_name='my-key-value-store')
            store = await actor.open_key_value_store(name='my-key-value-store')
            print(await store.get_value('data.csv'))
            actor.log.info(f"Ad IDs: {ad_ids}")
182
183
def download_video(url, ad_id, headers=None):
    """Download an ad video to `<ad_id>.mp4` in the working directory.

    Args:
        url: Direct video URL (e.g. the `src` of the ad's <video> tag).
        ad_id: Ad identifier used to name the output file.
        headers: Optional HTTP headers for the request (e.g. a User-Agent).
            Previously this function read an undefined global `headers`, which
            raised NameError at runtime; it is now an explicit parameter.
    """
    response = requests.get(url, headers=headers, stream=True)
    if response.status_code == 200:
        filename = f"{ad_id}.mp4"
        with open(filename, "wb") as f:
            # Stream in 1 KiB chunks so large videos are never held in memory.
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
        # Module-level logging: `actor` is not in scope outside main().
        logging.info(f"{filename} downloaded successfully.")
    else:
        logging.info(f"Failed to download video from {url}. Status code: {response.status_code}")
195
196
# Allow running this module directly (outside the `python -m src` entry point).
if __name__ == "__main__":
    asyncio.run(main())

    # Example usage:
    # url = 'https://library.tiktok.com/api/v1/cdn/.../video/...'  # a <video> src scraped from an ad detail page
    # ad_id = 123
    # download_video(url, ad_id)

.dockerignore

# configurations
.idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
.venv
# git folder
.git

.editorconfig

root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.gitignore

# This file tells Git which files shouldn't be added to source control
.idea
.DS_Store
apify_storage
storage
.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg
__pycache__
.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache
.ruff_cache
.scrapy
*.log

requirements.txt

1beautifulsoup4
2pandas
3selenium
4requests