TikTok Ads Scraper
Deprecated
Pricing: Pay per usage
Total users: 1
Monthly users: 2
Last modified: a year ago
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python-selenium:3.11

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]
.actor/actor.json
{ "actorSpecification": 1, "name": "TiktokAdsScrapper", "title": "TikTok Ads Scraper with Selenium", "description": "Scrapes TikTok ads information and displays the data.", "version": "0.0", "meta": { "templateId": "python-selenium" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile", "storages": { "dataset": { "actorSpecification": 1, "fields" : {}, "views": { "tiktok-ads-data": { "title": "tiktok-ads-data", "transformation": { "fields": [ "ad_id", "ad_advertiser", "first_shown", "last_shown", "unique_user_views", "target_audience", "country_list", "gender", "age", "additional_parameters", "video_link" ] }, "display": { "component": "table", "properties": { "ad_id": { "label": "Ad ID", "format": "text" }, "ad_advertiser": { "label": "Advertiser", "format": "text" }, "first_shown": { "label": "First Shown", "format": "text" }, "last_shown": { "label": "Last Shown", "format": "text" }, "unique_user_views": { "label": "Unique User Views", "format": "text" }, "target_audience": { "label": "Target Audience", "format": "text" }, "country_list": { "label": "Country List", "format": "text" }, "gender": { "label": "Gender Distribution", "format": "text" }, "age": { "label": "Age Distribution", "format": "text" }, "additional_parameters": { "label": "Additional Parameters", "format": "text" }, "video_link": { "label": "Video Link", "format": "link" } } } } } } }}
.actor/input_schema.json
{ "title": "Python Selenium Scraper", "type": "object", "schemaVersion": 1, "properties": { "start_urls": { "title": "Start URLs", "type": "array", "description": "URLs to start with", "prefill": [ { "url": "https://apify.com" } ], "editor": "requestListSources" }, "max_depth": { "title": "Maximum depth", "type": "integer", "description": "Depth to which to scrape to", "default": 1 } }, "required": ["start_urls"]}
src/__main__.py
1"""2This module serves as the entry point for executing the Apify Actor. It handles the configuration of logging3settings. The `main()` coroutine is then executed using `asyncio.run()`.4
5Feel free to modify this file to suit your specific needs.6"""7
8import asyncio9import logging10
11from apify.log import ActorLogFormatter12
13from .main import main14
15# Configure loggers16handler = logging.StreamHandler()17handler.setFormatter(ActorLogFormatter())18
19apify_client_logger = logging.getLogger('apify_client')20apify_client_logger.setLevel(logging.INFO)21apify_client_logger.addHandler(handler)22
23apify_logger = logging.getLogger('apify')24apify_logger.setLevel(logging.DEBUG)25apify_logger.addHandler(handler)26
27# Execute the Actor main coroutine28asyncio.run(main())
src/main.py
import asyncio
import csv
import logging
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from apify import Actor

# SVG path compared against each targeting-table cell below; a match sets the
# corresponding gender/age flag to False.
NOT_TARGETED_PATH = "M6 23a1 1 0 0 1 1-1h34a1 1 0 0 1 1 1v2a1 1 0 0 1-1 1H7a1 1 0 0 1-1-1v-2Z"


async def main():
    async with Actor() as actor:
        input_data = await actor.get_input() or {}
        start_urls = input_data.get('start_urls', [])
        max_depth = input_data.get('max_depth', 1)  # accepted from the input schema but not used below

        dataset = await actor.open_dataset(name='tiktok-ads-data')

        for url_obj in start_urls:
            start_url = url_obj.get('url')
            if not start_url:
                continue
            # start_url = "https://library.tiktok.com/ads?region=FR&start_time=1712082600000&end_time=1712169000000&adv_name=fashion&adv_biz_ids=&query_type=1&sort_type=last_shown_date,desc"

            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')

            # Instantiate WebDriver
            driver = webdriver.Chrome(options=chrome_options)

            try:
                driver.get(start_url)
                wait = WebDriverWait(driver, 10)  # Creating WebDriverWait instance
                while True:
                    # Try clicking the "View More" button until it is no longer present
                    view_more_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".loading_more")))
                    driver.execute_script("arguments[0].scrollIntoView();", view_more_button)  # Scroll to the button
                    driver.execute_script("window.scrollBy(0, -100);")
                    driver.execute_script("arguments[0].click();", view_more_button)
                    # Optional: wait for the content to load
                    WebDriverWait(driver, 2).until(lambda d: d.find_element(By.CSS_SELECTOR, ".ad_card"))
            except (TimeoutException, NoSuchElementException):
                actor.log.info("All content loaded or button not found.")

            html = driver.page_source
            driver.quit()

            soup = BeautifulSoup(html, 'html.parser')
            ad_links = soup.find_all('a', class_='link')

            # The ad ID is the last query parameter of each ad-detail link
            ad_ids = [link['href'].split('=')[-1] for link in ad_links]

            base_url = 'https://library.tiktok.com/ads/detail/?ad_id='
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
            }

            for ad_id in ad_ids:
                ad_url = base_url + ad_id
                driver = webdriver.Chrome(options=chrome_options)
                driver.get(ad_url)

                time.sleep(2)
                ad_html = driver.page_source
                ad_soup = BeautifulSoup(ad_html, 'html.parser')
                details_tags = ad_soup.find_all('span', {'class': 'item_value'})

                advertiser = ad_soup.find('div', {'class': 'ad_advertiser_value'})
                if advertiser is None:
                    advertiser = f"no advertiser available for ad_id: {ad_id}"
                else:
                    advertiser = advertiser.text

                video_link_tag = ad_soup.find('video')
                if video_link_tag is None:
                    video_link = f"no video available for ad_id: {ad_id}"
                else:
                    video_link = video_link_tag['src']

                target_audience = ad_soup.find('span', {'class': 'ad_target_audience_size_value'})
                if target_audience is None:
                    target_audience = f"no views available for ad_id: {ad_id}"
                else:
                    target_audience = target_audience.text

                details_list = []
                for detail in details_tags:
                    details_list.append(detail.text)

                rows = ad_soup.find_all('tbody', class_='byted-Table-Body')

                gender = []
                age = []
                country_list = []
                addn_parameters = []

                # First table: gender targeting per country
                countries = rows[0].find_all('tr')
                for c in countries:
                    cells = c.find_all('td')
                    country = cells[1].text.strip()
                    country_list.append(country)
                    male, female, unknown = True, True, True
                    if cells[2].find('path')['d'] == NOT_TARGETED_PATH:
                        male = False
                    if cells[3].find('path')['d'] == NOT_TARGETED_PATH:
                        female = False
                    if cells[4].find('path')['d'] == NOT_TARGETED_PATH:
                        unknown = False

                    entry = {
                        'country': country,
                        'gender': {
                            'Male': male,
                            'Female': female,
                            'Unknown': unknown
                        }
                    }
                    gender.append(entry)

                # Second table: age targeting per country
                countries = rows[1].find_all('tr')
                for c in countries:
                    cells = c.find_all('td')
                    country = cells[1].text.strip()
                    ages = [True] * 6
                    for i in range(6):
                        if cells[2 + i].find('path')['d'] == NOT_TARGETED_PATH:
                            ages[i] = False

                    entry = {
                        'country': country,
                        'ages': {
                            '13-17': ages[0],
                            '18-24': ages[1],
                            '25-34': ages[2],
                            '35-44': ages[3],
                            '45-54': ages[4],
                            '55+': ages[5],
                        }
                    }
                    age.append(entry)

                param_rows = ad_soup.find_all('tr', class_="targeting_additional_parameters_table_row")

                entry = {}
                for p in param_rows:
                    param = p.find('td', class_="targeting_additional_parameters_table_first_col")
                    status = p.find('td', class_='')
                    if status is not None:
                        entry[param.text] = status.text
                    else:
                        entry[param.text] = 'None'

                addn_parameters.append(entry)
                await dataset.push_data({
                    'ad_id': ad_id,
                    'ad_advertiser': advertiser,
                    'first_shown': details_list[0],
                    'last_shown': details_list[1],
                    'unique_user_views': details_list[2],
                    'target_audience': target_audience,
                    'country_list': country_list,
                    'gender': gender,
                    'age': age,
                    'additional_parameters': addn_parameters,
                    'video_link': video_link
                })

                driver.quit()

            # Export the data as CSV
            await dataset.export_to_csv('data.csv', to_key_value_store_name='my-key-value-store')

            # Print the exported records
            store = await actor.open_key_value_store(name='my-key-value-store')
            print(await store.get_value('data.csv'))
            actor.log.info(f"Ad IDs: {ad_ids}")


def download_video(url, ad_id, headers=None):
    """Download an ad video to '<ad_id>.mp4'; `headers` may carry a User-Agent."""
    response = requests.get(url, headers=headers, stream=True)
    if response.status_code == 200:
        filename = f"{ad_id}.mp4"
        with open(filename, "wb") as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
        logging.info(f"{filename} downloaded successfully.")
    else:
        logging.info(f"Failed to download video from {url}. Status code: {response.status_code}")


if __name__ == "__main__":
    asyncio.run(main())
    # Example usage:
    # url = 'https://library.tiktok.com/api/v1/cdn/1712151504/video/aHR0cHM6Ly92NzcudGlrdG9rY2RuLmNvbS8zZjJiOWU5YmNhOGRlMGJjZjA3YmIwYWRiN2E3ZjE4Yi82NjBkYjAzOS92aWRlby90b3MvdXNlYXN0MmEvdG9zLXVzZWFzdDJhLXZlLTAwNjgtZXV0dHAvb01JRUtGbWVzQkM2RFVEOUVmeFJRUFFRRVd3aWdDaWRzNEpBUnQv/7cd4caab-916d-44c8-b3b9-34c155a810e1?a=475769&bti=PDU2NmYwMy86&ch=0&cr=0&dr=1&cd=0%7C0%7C0%7C0&cv=1&br=3876&bt=1938&cs=0&ds=6&ft=.NpOcInz7Th4FkErXq8Zmo&mime_type=video_mp4&qs=0&rc=aGg7ZmY3NTdnNDxnOzNoZ0BpM3RsO2s5cnFmcjMzZjczM0BjLmJiLTUxXjQxNDUxXy9hYSNkbWZjMmRjMTJgLS1kMWNzcw%3D%3D&vvpl=1&l=20240403133823D8EDE6F40EAE406831A7&btag=e00088000&cc=13'
    # ad_id = 123
    # download_video(url, ad_id)
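Below is a minimal sketch of starting this Actor from another Python script with the Apify API client. The token, the Actor identifier ('<username>/TiktokAdsScrapper'), the start URL, and the dataset ID are placeholders to substitute for your own account.

# Usage sketch only; not part of the Actor itself.
from apify_client import ApifyClient

client = ApifyClient('<YOUR_APIFY_API_TOKEN>')

# Start a run of the Actor and wait for it to finish.
run = client.actor('<username>/TiktokAdsScrapper').call(run_input={
    'start_urls': [{'url': '<TikTok Ads Library search URL>'}],
})
print('Run finished with status:', run['status'])

# src/main.py pushes results into the named dataset 'tiktok-ads-data'
# (not the run's default dataset), so read that dataset by its ID.
for item in client.dataset('<ID of the tiktok-ads-data dataset>').iterate_items():
    print(item['ad_id'], item['ad_advertiser'], item['video_link'])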
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
.venv

# git folder
.git
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.gitignore
# This file tells Git which files shouldn't be added to source control
.idea
.DS_Store

apify_storage
storage

.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg

__pycache__

.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache
.ruff_cache

.scrapy
*.log
requirements.txt
beautifulsoup4
pandas
selenium
requests