import asyncio
import csv
import logging
import time

import requests
from apify import Actor
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
12
13
# SVG <path d="..."> value TikTok renders for the "dash" icon marking a
# NOT-targeted segment in the targeting tables (matched byte-for-byte below).
_NOT_TARGETED_PATH = "M6 23a1 1 0 0 1 1-1h34a1 1 0 0 1 1 1v2a1 1 0 0 1-1 1H7a1 1 0 0 1-1-1v-2Z"

# Age buckets in the column order of the age-targeting table.
_AGE_BUCKETS = ('13-17', '18-24', '25-34', '35-44', '45-54', '55+')


def _cell_targeted(cell):
    """Return False when *cell* contains the dash icon (segment not targeted), else True."""
    path = cell.find('path')
    # Guard against cells with no <path>: the original indexed ['d'] unconditionally
    # and would raise TypeError on such cells.
    return not (path is not None and path.get('d') == _NOT_TARGETED_PATH)


def _load_full_listing(driver, url):
    """Open *url* and click 'view more' until no button remains; return final HTML."""
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    try:
        while True:
            button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".loading_more")))
            driver.execute_script("arguments[0].scrollIntoView();", button)
            # Scroll back up a little so the button is not hidden under a sticky header.
            driver.execute_script("window.scrollBy(0, -100);")
            driver.execute_script("arguments[0].click();", button)
            # Short wait for at least one ad card to confirm new content rendered.
            WebDriverWait(driver, 2).until(lambda d: d.find_element(By.CSS_SELECTOR, ".ad_card"))
    except (TimeoutException, NoSuchElementException):
        # Expected terminal state: the "view more" button is gone or stopped loading.
        pass
    return driver.page_source


def _parse_gender_table(tbody):
    """Parse the by-country gender table; return (country_list, gender entries)."""
    country_list = []
    gender = []
    for row in tbody.find_all('tr'):
        cells = row.find_all('td')
        country = cells[1].text.strip()
        country_list.append(country)
        gender.append({
            'country': country,
            'gender': {
                'Male': _cell_targeted(cells[2]),
                'Female': _cell_targeted(cells[3]),
                'Unknown': _cell_targeted(cells[4]),
            },
        })
    return country_list, gender


def _parse_age_table(tbody):
    """Parse the by-country age table into a list of per-country bucket dicts."""
    age = []
    for row in tbody.find_all('tr'):
        cells = row.find_all('td')
        age.append({
            'country': cells[1].text.strip(),
            'ages': {
                bucket: _cell_targeted(cells[2 + i])
                for i, bucket in enumerate(_AGE_BUCKETS)
            },
        })
    return age


def _parse_additional_parameters(soup):
    """Collect the 'additional parameters' table into a single {name: status} dict."""
    entry = {}
    for row in soup.find_all('tr', class_="targeting_additional_parameters_table_row"):
        param = row.find('td', class_="targeting_additional_parameters_table_first_col")
        status = row.find('td', class_='')
        entry[param.text] = status.text if status is not None else 'None'
    # Original code pushed exactly one accumulated dict per ad; keep that shape.
    return [entry]


def _scrape_ad_detail(driver, ad_id):
    """Open the detail page for *ad_id* in *driver* and return one dataset record."""
    driver.get('https://library.tiktok.com/ads/detail/?ad_id=' + ad_id)
    time.sleep(2)  # detail page is client-rendered; give it a moment to populate
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    advertiser_tag = soup.find('div', {'class': 'ad_advertiser_value'})
    advertiser = advertiser_tag.text if advertiser_tag is not None \
        else f"no advertiser available for ad_id: {ad_id}"

    video_tag = soup.find('video')
    video_link = video_tag['src'] if video_tag is not None \
        else f"no video available for ad_id: {ad_id}"

    audience_tag = soup.find('span', {'class': 'ad_target_audience_size_value'})
    target_audience = audience_tag.text if audience_tag is not None \
        else f"no views available for ad_id: {ad_id}"

    details = [tag.text for tag in soup.find_all('span', {'class': 'item_value'})]
    # Pad to three entries so sparse pages no longer raise IndexError on details[0..2].
    while len(details) < 3:
        details.append('')

    rows = soup.find_all('tbody', class_='byted-Table-Body')
    # Guard missing tables: the original indexed rows[0]/rows[1] unconditionally.
    country_list, gender = _parse_gender_table(rows[0]) if len(rows) > 0 else ([], [])
    age = _parse_age_table(rows[1]) if len(rows) > 1 else []

    return {
        'ad_id': ad_id,
        'ad_advertiser': advertiser,
        'first_shown': details[0],
        'last_shown': details[1],
        'unique_user_views': details[2],
        'target_audience': target_audience,
        'country_list': country_list,
        'gender': gender,
        'age': age,
        'additional_parameters': _parse_additional_parameters(soup),
        'video_link': video_link,
    }


async def main():
    """Apify actor entry point.

    For each start URL: load the full TikTok Ads Library listing (clicking
    "view more" until exhausted), collect the ad ids, scrape each ad's detail
    page, push the records to a named dataset, and export them as CSV to a
    key-value store.
    """
    async with Actor() as actor:
        input_data = await actor.get_input() or {}
        start_urls = input_data.get('start_urls', [])
        # NOTE(review): max_depth is read from input but never used — confirm intent.
        max_depth = input_data.get('max_depth', 1)

        dataset = await actor.open_dataset(name='tiktok-ads-data')

        # One options object is enough; it can be reused across driver launches.
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')

        for url_obj in start_urls:
            start_url = url_obj.get('url')
            if not start_url:
                continue

            # One driver per start URL (the original leaked drivers on error and
            # launched a fresh Chrome for every single ad id).
            driver = webdriver.Chrome(chrome_options)
            try:
                html = _load_full_listing(driver, start_url)
                actor.log.info("All content loaded or button not found.")

                soup = BeautifulSoup(html, 'html.parser')
                ad_ids = [link['href'].split('=')[-1]
                          for link in soup.find_all('a', class_='link')]

                for ad_id in ad_ids:
                    record = _scrape_ad_detail(driver, ad_id)
                    await dataset.push_data(record)
            finally:
                driver.quit()

            await dataset.export_to_csv('data.csv', to_key_value_store_name='my-key-value-store')

            store = await actor.open_key_value_store(name='my-key-value-store')
            print(await store.get_value('data.csv'))

            actor.log.info(f"Ad IDs: {ad_ids}")
182
183
logger = logging.getLogger(__name__)

# Default browser-like headers; the original referenced an out-of-scope
# `headers` local from main() and raised NameError on every call.
_DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}


def download_video(url, ad_id, headers=None):
    """Stream the video at *url* to ``<ad_id>.mp4`` in the working directory.

    Args:
        url: Direct video URL (typically the <video src> scraped from an ad page).
        ad_id: Used to name the output file.
        headers: Optional request headers; defaults to a browser-like User-Agent.

    Returns:
        True if the file was written, False on a non-200 response.
    """
    response = requests.get(url, headers=headers or _DEFAULT_HEADERS, stream=True)
    if response.status_code == 200:
        filename = f"{str(ad_id)}.mp4"
        with open(filename, "wb") as f:
            # Stream in 1 KiB chunks to avoid holding the whole video in memory.
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
        # Original logged the literal "(unknown)"; log the actual filename.
        # `actor` was also out of scope here — use module logging instead.
        logger.info("%s downloaded successfully.", filename)
        return True
    logger.info("Failed to download video from %s. Status code: %s", url, response.status_code)
    return False
195
196
if __name__ == "__main__":
    # Run the async actor entry point (asyncio is imported at the top of the file).
    asyncio.run(main())
199
200
201
202
203