Street Fighter 6 CFN Scraper
Try for free
No credit card required
Go to Store
Street Fighter 6 CFN Scraper
3ternal/street-fighter-6-cfn-scraper
Try for free
No credit card required
SF6 scraper for the Capcom Fighters Network (Buckler's Boot Camp)
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python-playwright

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging.
# --no-cache-dir keeps pip's download cache out of the image layer, shrinking the image.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install --no-cache-dir -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "street-fighter-6-cfn-scraper",
4 "title": "Street Fighter 6 CFN Scraper",
5 "description": "SF6 scraper for the Capcom Fighters Network (Buckler's Boot Camp)",
6 "version": "1.0",
7 "meta": {
8 "templateId": "python-beautifulsoup"
9 },
10 "input": "./input_schema.json",
11 "dockerfile": "./Dockerfile",
12 "storages": {
13 "dataset": {
14 "actorSpecification": 1,
15 "title": "URLs and their titles",
16 "views": {
17 "titles": {
18 "title": "URLs and their titles",
19 "transformation": {
20 "fields": [
21 "url",
22 "title"
23 ]
24 },
25 "display": {
26 "component": "table",
27 "properties": {
28 "url": {
29 "label": "URL",
30 "format": "text"
31 },
32 "title": {
33 "label": "Title",
34 "format": "text"
35 }
36 }
37 }
38 }
39 }
40 }
41 }
42}
.actor/input_schema.json
1{
2 "title": "Street Fighter 6 CFN Scraper",
3 "type": "object",
4 "schemaVersion": 1,
5 "properties": {
6 "email": {
7 "title": "CFN Email",
8 "type": "string",
9 "description": "Your CFN email address",
10 "editor": "textfield"
11 },
12 "password": {
13 "title": "CFN Password",
14 "type": "string",
15 "description": "Your CFN password",
16 "editor": "textfield"
17 },
18 "rank_to_search": {
19 "title": "Only Search for Specific Rank (Index)",
20 "type": "integer",
21 "description": "Finds the starting point of a specific rank (0 is Master, 35 is Rookie 1). If blank, the scraper will start from Master and attempt to find all of the ranks. However, the site will usually block you after a few tries, so it might be better to search for a specific rank instead.",
22 "editor": "number"
23 },
24 "initial_page_jump": {
25 "title": "Initial Page Jump",
26 "type": "integer",
27 "description": "How many pages should we skip per attempt? Leave blank unless your page to start searching is very close to the target page.",
28 "editor": "number"
29 },
30 "start_page": {
31 "title": "Start Searching at This Page",
32 "type": "integer",
33 "description": "Start searching for target rank on this page (will speed up execution if accurate). If blank, we'll use our predetermined page numbers (accurate as of March 2024).",
34 "editor": "number"
35 }
36 },
37 "required": ["email", "password"]
38}
src/__main__.py
1"""
2This module serves as the entry point for executing the Apify Actor. It handles the configuration of logging
3settings. The `main()` coroutine is then executed using `asyncio.run()`.
4
5Feel free to modify this file to suit your specific needs.
6"""
7
8import asyncio
9import logging
10
11from apify.log import ActorLogFormatter
12
13from .main import main
14
# Configure logging: both Apify loggers share one stream handler with the SDK formatter.
handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter())

# The HTTP client logger is capped at INFO; the SDK logger gets full DEBUG output.
for logger_name, level in (('apify_client', logging.INFO), ('apify', logging.DEBUG)):
    configured_logger = logging.getLogger(logger_name)
    configured_logger.setLevel(level)
    configured_logger.addHandler(handler)

# Execute the Actor main coroutine
asyncio.run(main())
src/main.py
1from urllib.parse import urljoin
2from apify import Actor
3from apify.log import ActorLogFormatter
4import logging
5from playwright.async_api import async_playwright, Page, expect
6from bs4 import BeautifulSoup
7import math
8
# --- Playwright session state (populated by SetUp) ---
browser = None           # Playwright Browser instance
context = None           # browser context the page lives in
page = None              # the single Page object all navigation happens on
readable_content = None  # prettified HTML of the last page captured by GetPageHtml (debug aid)

# --- search state shared between FindPlacingOfFirstUserInRank and SearchForBeginningOfRank ---
was_above_target = None  # tracks which side of the target LP the last probe landed on (see SearchForBeginningOfRank)
iterations = 0           # safety counter so the probe loop can't run forever
page_jump = 1000         # how many ranking pages to skip per probe; halved after each overshoot
user_dict = {}           # NOTE(review): never read or written anywhere in this file — candidate for removal
current_page_int = 1     # ranking page currently loaded
current_target_lp = 0    # LP threshold of the rank currently being searched for

# --- user-supplied configuration (populated by GetInfoFromActorInput) ---
email = ""               # CFN account email
password = ""            # CFN account password
start_pages = []         # per-rank page numbers to start probing from
search_only_this_rank = 0  # rank index to restrict the search to (None = search all ranks)
initial_page_jump = 0    # user override for the first page_jump value

# Ranking list URL; the page number is appended to this prefix.
base_start_url = "https://www.streetfighter.com/6/buckler/ranking/league?character_filter=2&character_id=luke&platform=1&user_status=1&home_filter=1&home_category_id=0&home_id=1&league_rank=0&page="

# Minimum LP required for each rank, index-aligned with `ranks` (highest rank first).
target_lp_per_rank = [
    25000, # Master
    23800, 22600, 21400, 20200, 19000, # Diamond
    17800, 16600, 15400, 14200, 13000, # Platinum
    12200, 11400, 10600, 9800, 9000, # Gold
    8200, 7400, 6600, 5800, 5000, # Silver
    4600, 4200, 3800, 3400, 3000, # Bronze
    2600, 2200, 1800, 1400, 1000, # Iron
    800, 600, 400, 200, 0 # Rookie
    ]

# Rank names from highest to lowest, index-aligned with target_lp_per_rank.
ranks = [
    "Master",
    "Diamond 5", "Diamond 4", "Diamond 3", "Diamond 2", "Diamond 1",
    "Platinum 5", "Platinum 4", "Platinum 3", "Platinum 2", "Platinum 1",
    "Gold 5", "Gold 4", "Gold 3", "Gold 2", "Gold 1",
    "Silver 5", "Silver 4", "Silver 3", "Silver 2", "Silver 1",
    "Bronze 5", "Bronze 4", "Bronze 3", "Bronze 2", "Bronze 1",
    "Iron 5", "Iron 4", "Iron 3", "Iron 2", "Iron 1",
    "Rookie 5", "Rookie 4", "Rookie 3", "Rookie 2", "Rookie 1",
]

# Default page numbers to start probing from, per rank (the input schema notes
# these were accurate as of March 2024).
estimated_start_pages = [
    6102, #master
    6462, 7248, 8300, 9637, 12306, #diamond
    13994, 15815, 18101, 20841, 26407, #platinum
    28248, 29682, 31164, 32676, 35215, #gold
    36138, 37770, 39772, 41749, 44936, #silver
    45672, 47114, 48594, 49774, 51799, #bronze
    52580, 54017, 55291, 56674, 59273, #iron
    59795, 60333, 60962, 61552, 61552 #rookie
]

# Results accumulated by DoSearch.
placement_of_users_per_rank = []  # placement of the first user found in each searched rank
players_in_each_rank = []         # derived player counts (only filled when >1 rank was searched)
64
async def main():
    """Entry point: read the Actor input, then launch Playwright and run the scrape."""
    async with Actor:
        # Pull the user's configuration and copy it into module-level state.
        user_input = await Actor.get_input() or {}
        GetInfoFromActorInput(user_input)

        # Spin up Playwright and perform the rank search.
        async with async_playwright() as pw:
            await DoSearch(pw)
74
def GetInfoFromActorInput(actor_input):
    """Copy the Actor's input fields into the module-level configuration globals.

    Reads: email, password, start_page, rank_to_search, initial_page_jump.
    Missing keys leave the corresponding global as None (via dict.get).
    """
    global email
    global password
    global start_pages
    global search_only_this_rank
    # BUGFIX: without this declaration, the assignment below created a dead
    # local and the user's initial_page_jump input was silently ignored.
    global initial_page_jump

    email = actor_input.get('email')
    password = actor_input.get('password')

    # Copy the estimates so an override below doesn't mutate the module-level
    # defaults (the original aliased the list directly).
    start_pages = list(estimated_start_pages)
    start_page_override = actor_input.get('start_page')

    search_only_this_rank = actor_input.get('rank_to_search')
    initial_page_jump = actor_input.get('initial_page_jump')

    # A start-page override only makes sense when a single rank is targeted.
    if search_only_this_rank is not None and start_page_override is not None:
        start_pages[search_only_this_rank] = start_page_override
92
async def DoSearch(playwright):
    """Top-level search routine: log in, then find where each requested rank begins.

    Results are accumulated in the module-level lists
    placement_of_users_per_rank and players_in_each_rank.
    """
    global placement_of_users_per_rank
    global players_in_each_rank

    # create the browser and log in
    await SetUp(playwright)

    # get the total number of players (shown in the pager footer; the first
    # character appears to be a separator, so it is stripped)
    total_players_str = await page.locator("span[class='ranking_ranking_now__last__oqSXS']").last.text_content()
    total_players_str = total_players_str[1:]
    total_players_str = total_players_str.strip()
    total_players_int = int(total_players_str)

    # we might want to only search for one rank at a time, because the site will kick us out after we load a few pages
    if (search_only_this_rank is not None):
        start_index = search_only_this_rank
        end_index = search_only_this_rank + 1
    #otherwise, we'll attempt to search all of them
    else:
        start_index = 0
        end_index = len(ranks)

    print("Start rank index: " + str(start_index) + "\nEnd rank index: " + str(end_index - 1))

    # find the starting point of each rank in our list
    for i in range(start_index, end_index):
        target_lp = target_lp_per_rank[i]
        start_page = start_pages[i]
        rank_name = ranks[i]
        print("Searching for " + rank_name + "...")

        #find where the rank begins
        placement_of_first_user = await FindPlacingOfFirstUserInRank(playwright, start_page, target_lp)
        print("\n" + rank_name + " begins at #" + str(placement_of_first_user) + "\nThere are " + str(total_players_int) + " players in total\n")

        #and add it to a list
        placement_of_users_per_rank.append(placement_of_first_user)

    # this part is only relevant if we somehow managed to get a list of all ranks in a single run of this Actor, which is unlikely
    if len(placement_of_users_per_rank) > 1:
        for i in range(len(placement_of_users_per_rank)):
            rank_name = ranks[i]
            current_rank_starts_at = placement_of_users_per_rank[i]
            prev_rank_starts_at = placement_of_users_per_rank[i - 1] if i > 0 else 0
            # NOTE(review): this difference spans the rank ABOVE index i
            # (ranks[i-1]) but is labeled with ranks[i] below — looks like an
            # off-by-one in the labeling; confirm against real output before
            # relying on these counts.
            players_in_rank = current_rank_starts_at - prev_rank_starts_at

            percentage = GetPercentageString(players_in_rank, total_players_int)
            print(rank_name + " contains " + str(players_in_rank) + " players\nIt represents " + percentage + " of the playerbase")

            players_in_each_rank.append(players_in_rank)

        print("Placements of all users: " + str(placement_of_users_per_rank))
        print("Players in each rank: " + str(players_in_each_rank))
146
def GetPercentageString(players_in_rank, total_players):
    """Express players_in_rank as a percentage of total_players, e.g. '25.0%'."""
    ratio = players_in_rank / total_players
    return str(ratio * 100) + "%"
150
async def SetUp(playwright):
    """Launch the browser and log into CFN. Call this first, and only call it once.

    Populates the module-level browser/context/page globals used by every
    other coroutine, passes the age gate, logs in, and finishes by navigating
    to page 1 of the ranking list.
    """
    # Only the objects actually assigned here are declared global (the
    # original also declared iterations/page_jump, which it never touched).
    global browser
    global context
    global page

    browser = await playwright.firefox.launch(headless = Actor.config.headless)
    context = await browser.new_context()
    page = await context.new_page()
    login_page = "https://www.streetfighter.com/6/buckler/auth/loginep?redirect_url=/?status=login"

    # go to the login page
    await page.goto(login_page)
    await GetPageHtml()

    # fill out the age check dropdown
    await InputAgeCheck()
    await GetPageHtml()

    # uncomment this to check if cloudflare is blocking you for logging in too many times
    # print(readable_content)

    # log in
    await LogIn()
    await GetPageHtml()

    # uncomment this to check if cloudflare is blocking you for logging in too many times
    #print(readable_content)

    # it'll be easier if we just start on the ranking page
    start_url = base_start_url + str(1)
    await page.goto(start_url, timeout=60000)
186
async def FindPlacingOfFirstUserInRank(playwright, start_page, target_lp):
    """Locate the global placement of the first user in the rank defined by target_lp.

    Navigates to start_page of the ranking list and repeatedly probes pages
    (via SearchForBeginningOfRank) until the boundary between this rank and
    the rank below it is found.

    Args:
        playwright: unused; kept for interface compatibility with DoSearch.
        start_page: ranking page number to begin probing from.
        target_lp: minimum LP of the rank being searched for.

    Returns:
        The placement of the first user in the rank, or None if the search
        failed (missing start page, or the retry budget was exhausted).
    """
    global current_target_lp
    global page_jump

    current_target_lp = target_lp

    if start_page is None:
        Actor.log.error('Start page is null!')
        return

    # go to the ranking page
    start_url = base_start_url + str(start_page)
    await page.goto(start_url, timeout=60000)

    print("Start URL: " + page.url)

    # wait for the pagination widget so we know the ranking table has rendered
    pagination = page.locator("div[class='ranking_pc__LlGv4']").locator("div[class='ranking_ranking_pager__top__etBHR']").locator("ul[class='pagination']").first
    await expect(pagination).to_be_visible(timeout=30000)

    page_jump = initial_page_jump or 1000  # user override, else the default jump
    iterations = 0
    highest_user_in_last_rank = None

    # loop the search function
    while True:
        # each time this is called, we'll navigate to a new page
        highest_user_in_last_rank = await SearchForBeginningOfRank(pagination)

        # if we reached our goal, then break
        if highest_user_in_last_rank is not None:
            break

        # just in case something went wrong, we should prevent the loop from running infinitely
        iterations += 1
        if iterations > 30:
            break

    # BUGFIX: the original fell through here and dereferenced None when the
    # retry budget ran out; bail out explicitly instead of crashing.
    if highest_user_in_last_rank is None:
        Actor.log.error('Gave up after ' + str(iterations) + ' attempts without finding the rank boundary')
        return

    # the <dt> holds the placement, prefixed with one character that we strip
    placement_str = await highest_user_in_last_rank.locator("dt").text_content()
    placement_str = placement_str.strip()
    placement_str = placement_str[1:]
    placement_int = int(placement_str)

    # the <dd> holds the LP; the last 3 chars are a suffix (presumably " LP") we strip
    lp_str = await highest_user_in_last_rank.locator("dd").text_content()
    lp_str = lp_str[:-3]
    lp_int = int(lp_str)  # parsed only as a sanity check; fails loudly on malformed LP

    username = await highest_user_in_last_rank.locator("span[class='ranking_name__El29_']").text_content()

    # print the results
    print("\nHighest ranked user in previous rank: " + str(username))
    print("LP: " + lp_str + "\nPosition: " + placement_str + "\nPage: " + str(current_page_int) + "\nURL: " + page.url)

    # placement_int refers to the highest-placed user in the prev rank
    # so we'll add 1 to give us the first user in the current rank
    return placement_int + 1
245
async def SearchForBeginningOfRank(pagination):
    """Probe the currently loaded ranking page for the boundary of the target rank.

    Each call inspects the loaded page, and either (a) returns the locator of
    the highest-ranked user in the PREVIOUS rank — which tells the caller
    where the current rank begins — or (b) adjusts page_jump, navigates to a
    new page, and returns None so the caller will call this again.

    The navigation follows a coarse binary search: jump forward while the
    page's last user is above the target LP, jump backward when below it, and
    halve page_jump on every direction change.
    """
    global was_above_target
    global page_jump
    global current_page_int

    # get info about current and next page (the active pager item holds the page number)
    current_page = pagination.locator("xpath=/li[@class='active']").first
    await expect(current_page).to_be_visible()

    current_page_text = await current_page.text_content()
    current_page_str = str(current_page_text)
    current_page_int = int(current_page_str)
    print("\nCurrent page: " + current_page_str)

    # first, we need to find the ranking page
    ranking_list = page.locator("xpath=//ul[@class='ranking_ranking_list__szajj']").first
    await expect(ranking_list).to_be_visible()

    # find the last user's lp (users are listed in descending LP order, so the
    # last <li> has the lowest LP on the page)
    all_users_on_page = await ranking_list.locator("xpath=/li").all()
    last_user = all_users_on_page[len(all_users_on_page) - 1]
    await expect(last_user).to_be_visible()

    # strip the trailing 3 characters (presumably " LP" — TODO confirm)
    last_user_lp_str = await last_user.locator("dd").text_content()
    last_user_lp_str = last_user_lp_str[:-3]
    last_user_lp_int = int(last_user_lp_str)
    print("LP of the last user on the page: " + str(last_user_lp_int) + "\nWe're looking for " + str(current_target_lp))

    # we're trying to roughly find the last page of the lower rank
    highest_lp_in_last_rank = current_target_lp - 1

    # if we're above the target, then count downwards
    if last_user_lp_int > highest_lp_in_last_rank:
        #every time we overshoot, we'll halve the size of the jump
        # NOTE(review): was_above_target is set to False in this above-target
        # branch and True in the below-target branch — the name reads inverted
        # relative to its value, but the halving logic is self-consistent.
        if was_above_target == True:
            page_jump = math.floor(page_jump / 2)

        # if we're getting close, we'll need a much lower page jump
        if last_user_lp_int == current_target_lp:
            if abs(page_jump) > 50:
                page_jump = 50

        page_jump = abs(page_jump)
        print("Page jump: " + str(page_jump))

        was_above_target = False

    # if we're below the target (most likely if we overshot), then count upwards
    # (the -20 slack keeps us in the slow-scan branch once we're nearly there)
    elif last_user_lp_int < highest_lp_in_last_rank - 20:
        #every time we overshoot, we'll halve the size of the jump
        if was_above_target == False:
            page_jump = math.floor(page_jump / 2)

        page_jump = -abs(page_jump)
        print("Page jump: " + str(page_jump))

        was_above_target = True

    # once you've found the new rank, we have no choice but to iterate slowly and find where the old rank ends
    else:
        print("We're very close to our target! It's time to start incrementing one page at a time")

        # get all lp on page
        lp_list = await GetAllLpOnPage(ranking_list)

        # basically, we want to find the first person with the LP of the target rank
        if not current_target_lp in lp_list:
            print("We overshot a little, so we'll have to move backward one page at a time to find the first user with an LP of " + str(current_target_lp))
            page_jump = -1

        # if we found someone at current_target_lp, then I think that's it?
        else:
            target_index = -1

            # first user whose LP drops below the target = highest user of the
            # previous rank. NOTE(review): if no such user exists on this page,
            # target_index stays -1 and the LAST user is returned — verify
            # that edge case is intended.
            for i in range(len(lp_list)):
                if lp_list[i] < current_target_lp:
                    target_index = i
                    break

            # we've found our target, so let's return the user and let the main method take control
            return all_users_on_page[target_index]

    # figure out the name of the URL to move to

    target_page = current_page_int + page_jump
    print("Target page: " + str(target_page))

    target_url = GetURLForPage(target_page)

    await page.goto(target_url, timeout=180000)
    await page.wait_for_url(target_url)
    await page.content()
340
def GetURLForPage(new_page_int):
    """Build the URL for ranking page new_page_int by swapping the page= query value.

    Relies on the current URL containing "page=<current_page_int>" (kept in
    sync by SearchForBeginningOfRank).
    """
    string_to_replace = "page=" + str(current_page_int)

    # BUGFIX: use the page number the caller asked for. The original ignored
    # new_page_int and recomputed current_page_int + page_jump from globals —
    # coincidentally the same value today, but a trap for any other caller.
    current_url = page.url
    return current_url.replace(string_to_replace, "page=" + str(new_page_int))
347
async def GetAllLpOnPage(ranking_list):
    """Return the LP value of every user row (<li>) in the given ranking-list locator.

    Args:
        ranking_list: Playwright locator for the <ul> holding the user rows.

    Returns:
        A list of ints, one per row, in page order.
    """
    children = await ranking_list.locator("xpath=/li").all()
    output = []
    # iterate over each <li>; the LP lives in its <dd> element with a 3-char
    # suffix (presumably " LP") that we strip before parsing
    for user_li in children:
        lp_raw = await user_li.locator("dd").text_content()
        lp_int = int(str(lp_raw)[:-3])
        output.append(lp_int)

    print("All LP on page: " + str(output))
    return output
367
async def GetPageHtml():
    """Snapshot the current page's HTML into the readable_content global.

    Not needed for the search itself — purely a debugging aid (e.g. to check
    whether Cloudflare has blocked us).
    """
    global readable_content

    raw_html = await page.content()
    parsed = BeautifulSoup(raw_html, features = "html.parser")
    readable_content = parsed.prettify()
375
async def InputAgeCheck():
    """Completes the age-gate form (country + birth date) and submits it."""
    # Selecting the country first doubles as a check that the form is present;
    # any locator failure surfaces here before we touch the other fields.
    await page.locator("select[id='country']").select_option("Canada")

    # Fill the remaining dropdowns in the order the form presents them.
    for selector, choice in (
        ("select[id='birthYear']", '1992'),
        ("select[id='birthMonth']", '1'),
        ("select[id='birthDay']", '15'),
    ):
        await page.locator(selector).select_option(choice)

    # press submit
    await page.locator("button[name='submit']").click()

    # give the follow-up page a moment to load
    await page.wait_for_timeout(3000)

    print("Passed age check!\n")
394
async def LogIn():
    """Types the configured CFN credentials into the login form and submits it."""
    # Enter the credentials pulled from the Actor input.
    await page.locator("input[type='email']").fill(email)
    await page.locator("input[type='password']").fill(password)

    # Submit the form.
    await page.locator("button[name='submit']").click()

    # Logging in kicks off a slow redirect chain, so wait generously.
    await page.wait_for_timeout(10000)

    print("Logged in!\n")
.dockerignore
1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10.venv
11
12# git folder
13.git
.editorconfig
1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf
.gitignore
1# This file tells Git which files shouldn't be added to source control
2
3.idea
4.DS_Store
5
6apify_storage
7storage
8
9.venv/
10.env/
11__pypackages__
12dist/
13build/
14*.egg-info/
15*.egg
16
17__pycache__
18
19.mypy_cache
20.dmypy.json
21dmypy.json
22.pytest_cache
23.ruff_cache
24
25.scrapy
26*.log
requirements.txt
1# Feel free to add your Python dependencies below. For formatting guidelines, see:
2# https://pip.pypa.io/en/latest/reference/requirements-file-format/
3
4apify ~= 1.6.0
5beautifulsoup4 ~= 4.12.2
6httpx ~= 0.25.2
7types-beautifulsoup4 ~= 4.12.0.7