Spotify Albums Scraper avatar

Spotify Albums Scraper

Try for free

2-hour trial, then $19.99/month - No credit card required now

Go to Store
Spotify Albums Scraper

Spotify Albums Scraper

easyapi/spotify-albums-scraper
Try for free

2-hour trial, then $19.99/month - No credit card required now

Scrape Spotify albums by keywords. Extract comprehensive album data including artist details, cover art, release dates, and playability status. Perfect for music cataloging, album research, and industry analysis.

import { Actor } from 'apify';
import { PuppeteerCrawler } from 'crawlee';
import Redis from 'ioredis';
import fetch from 'node-fetch';
import puppeteerExtra from 'puppeteer-extra';
import stealthPlugin from 'puppeteer-extra-plugin-stealth';
import randomUseragent from 'random-useragent';

// Register the stealth plugin once at module load so every browser launched
// through `puppeteerExtra` gets it applied.
puppeteerExtra.use(stealthPlugin());

// NOTE(review): a commented-out Redis client that embedded a connection URL
// with an inline password used to live here. It was dead code and has been
// removed; the exposed credential should be rotated. If a Redis connection is
// ever needed, read the URL from an environment variable or the Actor input
// instead of committing it to source.

/**
 * Scrapes Spotify album search results for a list of keywords.
 *
 * For every keyword a headless browser opens the Spotify web search page,
 * listens for the `searchAlbums` responses of the `/pathfinder/v1/query`
 * endpoint, pushes the returned albums to the Actor dataset and keeps asking
 * for the next page until `maxItems` albums were collected or no new data
 * arrives.
 */
class SpotifyAlbumsScraper {
    /** Base URL of the Spotify web search page. */
    static SEARCH_BASE_URL = 'https://open.spotify.com/search';

    /** Base URL of a public Spotify album page. */
    static ALBUM_BASE_URL = 'https://open.spotify.com/album';

    /** Offset increment used when the intercepted request carries no `limit`. */
    static DEFAULT_PAGE_SIZE = 30;

    /** How often (ms) the request handler checks whether new albums arrived. */
    static POLL_INTERVAL_MS = 2000;

    /** Consecutive polls without new albums after which a search is considered finished (5 x 2 s = 10 s). */
    static MAX_IDLE_POLLS = 5;

    /** Pause (ms) before the next result page is requested. */
    static PAGINATION_DELAY_MS = 3000;

    /** Maximum number of albums collected per keyword. */
    maxItems = Infinity;

    /** Keyword currently being processed; copied into every stored album. */
    currentSearchUrl = undefined;

    /**
     * Cookies installed on every page before navigation (Puppeteer cookie
     * objects). Empty by default; callers may assign their own list.
     */
    cookies = [];

    /**
     * Resolves after `ms` milliseconds.
     * @param {number} ms
     * @returns {Promise<void>}
     */
    static sleep(ms) {
        return new Promise((resolve) => setTimeout(resolve, ms));
    }

    /**
     * Runs one crawl per keyword, sequentially.
     *
     * @param {{ keywords: string[] | string, maxItems?: number | null }} input
     *   `keywords` is the list of search terms (a single string is accepted as
     *   a one-element list). `maxItems` limits the albums stored per keyword;
     *   `undefined`/`null` means no limit.
     * @returns {Promise<void>}
     * @throws {TypeError} When `keywords` is neither an array nor a string.
     */
    async run(input) {
        const { keywords, maxItems } = input ?? {};
        // A bare string would otherwise be iterated character by character.
        const keywordList = typeof keywords === 'string' ? [keywords] : keywords;
        if (!Array.isArray(keywordList)) {
            throw new TypeError('Input field "keywords" must be an array of search keywords.');
        }
        if (keywordList.length === 0) {
            console.warn('No keywords provided, nothing to scrape.');
            return;
        }

        // `??` (not a destructuring default) so an explicit `null` also means "no limit".
        this.maxItems = maxItems ?? Infinity;

        for (const keyword of keywordList) {
            if (String(keyword ?? '').trim() === '') {
                console.warn('Skipping empty keyword.');
                continue;
            }
            // Remember the keyword so processResponseData() can tag each album with it.
            this.currentSearchUrl = keyword;
            const crawler = this.createCrawler();
            await crawler.run([this.buildSearchUrl(keyword)]);
        }
    }

    /**
     * Builds the album-search page URL for a keyword.
     * The keyword is percent-encoded so spaces, `/`, `#`, `?` etc. cannot
     * change the structure of the URL.
     *
     * @param {string} keyword
     * @returns {string}
     */
    buildSearchUrl(keyword) {
        return `${SpotifyAlbumsScraper.SEARCH_BASE_URL}/${encodeURIComponent(String(keyword))}/albums`;
    }

    /**
     * Creates the PuppeteerCrawler used for a single keyword.
     *
     * @returns {PuppeteerCrawler}
     */
    createCrawler() {
        const launchArgs = [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-accelerated-2d-canvas',
            '--no-first-run',
            '--no-zygote',
            '--disable-gpu',
            '--disable-geolocation',
            '--disable-notifications',
        ];

        // getRandom() yields nothing when no entry matches the filter; only pass
        // the flag when a user agent was found, otherwise the browser would be
        // launched with the literal user agent "null".
        const userAgent = randomUseragent.getRandom(
            (ua) => ua.browserName === 'Chrome' && Number.parseFloat(ua.browserVersion) >= 80,
        );
        if (userAgent) {
            launchArgs.push(`--user-agent=${userAgent}`);
        }

        return new PuppeteerCrawler({
            launchContext: {
                launcher: puppeteerExtra,
                launchOptions: {
                    headless: true,
                    args: launchArgs,
                },
            },
            requestHandlerTimeoutSecs: 3600, // one hour per search page
            navigationTimeoutSecs: 300, // five minutes
            preNavigationHooks: [
                async ({ page }) => {
                    await this.setCookies(page);
                },
            ],
            requestHandler: async ({ page, request }) => {
                console.log(`Processing ${request.url}...`);
                const interceptedData = [];

                await this.setupInterceptors(page, interceptedData);

                // Navigate (again) now that the listeners are attached so they
                // observe the search API responses.
                // NOTE(review): the crawler presumably already loaded the page
                // before calling this handler; attaching the listeners in a
                // pre-navigation hook would avoid the second load.
                await page.goto(request.url, { waitUntil: 'networkidle0' });

                await this.waitForResults(interceptedData);

                console.log(`Saved ${interceptedData.length} items`);
            },
        });
    }

    /**
     * Waits until `maxItems` albums were collected or no new album arrived for
     * MAX_IDLE_POLLS consecutive polls.
     *
     * @param {object[]} interceptedData Array filled by the response listener.
     * @returns {Promise<void>}
     */
    async waitForResults(interceptedData) {
        let lastSeenLength = 0;
        let idlePolls = 0;

        while (interceptedData.length < this.maxItems && idlePolls < SpotifyAlbumsScraper.MAX_IDLE_POLLS) {
            await SpotifyAlbumsScraper.sleep(SpotifyAlbumsScraper.POLL_INTERVAL_MS);

            if (interceptedData.length === lastSeenLength) {
                idlePolls++;
            } else {
                idlePolls = 0;
                lastSeenLength = interceptedData.length;
            }
        }
    }

    /**
     * Installs the cookies from `this.cookies` on the page.
     *
     * The previous implementation always installed a hard-coded TikTok session
     * cookie (`sid_guard` for `.tiktok.com`), which is unrelated to Spotify and
     * exposed a credential in source; it was removed.
     *
     * @param {import('puppeteer').Page} page
     * @returns {Promise<void>}
     */
    async setCookies(page) {
        const cookies = Array.isArray(this.cookies) ? this.cookies : [];
        if (cookies.length === 0) {
            return;
        }
        await page.setCookie(...cookies);
    }

    /**
     * Attaches request/response listeners that collect album search results
     * and trigger loading of the next result page.
     *
     * @param {import('puppeteer').Page} page
     * @param {object[]} interceptedData Array that receives the collected albums.
     * @returns {Promise<void>}
     */
    async setupInterceptors(page, interceptedData) {
        await page.setRequestInterception(true);
        page.on('request', (request) => {
            // continue() is asynchronous; without a handler a failure would
            // surface as an unhandled promise rejection.
            request.continue().catch((error) => {
                console.debug('Could not continue request:', error?.message ?? error);
            });
        });

        page.on('response', async (response) => {
            const request = response.request();
            const url = request.url();

            try {
                if (!(await this.shouldInterceptRequest(url, request))) {
                    return;
                }

                const responseBody = await response.json();
                const addedCount = await this.processResponseData(responseBody, interceptedData);

                // Stop paginating once the limit is reached or a page yielded
                // nothing new; otherwise empty pages would be requested forever.
                if (addedCount === 0 || interceptedData.length >= this.maxItems) {
                    return;
                }

                await SpotifyAlbumsScraper.sleep(SpotifyAlbumsScraper.PAGINATION_DELAY_MS);

                // The page may have been closed while we were waiting.
                if (page.isClosed()) {
                    return;
                }

                // Replay the request from inside the browser so it carries the
                // page's cookies; its response is picked up by this same listener.
                const nextUrl = this.buildNextPageUrl(url);
                await page.evaluate(async ({ nextUrl, headers }) => {
                    await fetch(nextUrl, {
                        method: 'GET',
                        headers,
                        credentials: 'include',
                    });
                }, {
                    nextUrl,
                    headers: request.headers(),
                });
            } catch (error) {
                console.error(`Error processing response:`, error);
            }
        });
    }

    /**
     * Derives the URL of the next result page from an intercepted search
     * request by advancing `variables.offset` by the request's own page size
     * (`variables.limit`, falling back to DEFAULT_PAGE_SIZE).
     *
     * @param {string} url URL of the intercepted `searchAlbums` request.
     * @returns {string} URL with the increased offset.
     */
    buildNextPageUrl(url) {
        const nextUrl = new URL(url);
        const variables = JSON.parse(nextUrl.searchParams.get('variables') || '{}') ?? {};
        const currentOffset = Number.parseInt(variables.offset ?? 0, 10) || 0;
        const pageSize = Number.parseInt(variables.limit, 10) || SpotifyAlbumsScraper.DEFAULT_PAGE_SIZE;

        variables.offset = currentOffset + pageSize;
        nextUrl.searchParams.set('variables', JSON.stringify(variables));
        return nextUrl.toString();
    }

    /**
     * Converts a Spotify album URI (`spotify:album:<id>`) into the public album
     * page URL.
     *
     * @param {unknown} uri
     * @returns {string | null} The album URL, or `null` when `uri` is not a string.
     */
    buildAlbumUrl(uri) {
        if (typeof uri !== 'string' || uri === '') {
            return null;
        }
        const albumId = uri.split(':').pop();
        return `${SpotifyAlbumsScraper.ALBUM_BASE_URL}/${albumId}`;
    }

    /**
     * Extracts the albums from a search response, appends them to
     * `interceptedData` (never exceeding `maxItems`) and pushes the newly added
     * ones to the Actor dataset.
     *
     * @param {object} responseBody Parsed JSON body of a `searchAlbums` response.
     * @param {object[]} interceptedData Albums collected so far; mutated in place.
     * @returns {Promise<number>} Number of albums added by this call.
     */
    async processResponseData(responseBody, interceptedData) {
        const items = responseBody?.data?.searchV2?.albumsV2?.items;
        if (!Array.isArray(items)) {
            return 0;
        }

        const remaining = this.maxItems - interceptedData.length;
        const newAlbums = [];

        for (const item of items) {
            if (newAlbums.length >= remaining) {
                break;
            }
            const data = item?.data;
            if (!data) {
                continue;
            }
            newAlbums.push({
                ...data,
                albumUrl: this.buildAlbumUrl(data.uri),
                keyword: this.currentSearchUrl,
            });
        }

        if (newAlbums.length === 0) {
            return 0;
        }

        // Update the shared array before awaiting so a concurrently running
        // response listener sees the correct remaining capacity.
        interceptedData.push(...newAlbums);
        await Actor.pushData(newAlbums);
        console.log(`Pushed ${newAlbums.length} items`);
        return newAlbums.length;
    }

    /**
     * Tells whether a request is a GET call to `/pathfinder/v1/query` whose
     * `operationName` query parameter is `searchAlbums`.
     *
     * @param {string} url
     * @param {{ method(): string }} request
     * @returns {Promise<boolean>}
     */
    async shouldInterceptRequest(url, request) {
        if (!url.includes('/pathfinder/v1/query') || request.method() !== 'GET') {
            return false;
        }
        try {
            return new URL(url).searchParams.get('operationName') === 'searchAlbums';
        } catch (error) {
            console.error('Error parsing URL:', error);
            return false;
        }
    }

    /**
     * Simulates scrolling by growing the height of the second child of the
     * `[data-testid="track-list"]` element in three to five random steps.
     * Resolves immediately when that element is not present.
     *
     * @param {import('puppeteer').Page} page
     * @returns {Promise<void>}
     */
    async scrollPage(page) {
        await page.evaluate(() => {
            return new Promise((resolve) => {
                const presentationDiv = document.querySelector('[data-testid="track-list"] > div:nth-child(2)');
                if (!presentationDiv) {
                    console.log('Presentation div not found');
                    resolve();
                    return;
                }

                // style.height is a string such as "1200px"; parse it so the
                // additions below are numeric rather than string concatenation.
                const parsedHeight = Number.parseFloat(presentationDiv.style.height);
                let currentHeight = Number.isNaN(parsedHeight) ? 1000 : parsedHeight;

                const scrollIncrement = 1000; // height added per step
                const scrollSteps = Math.floor(Math.random() * 3) + 3; // 3-5 steps
                let scrollCount = 0;

                const scrollInterval = setInterval(() => {
                    if (scrollCount >= scrollSteps) {
                        clearInterval(scrollInterval);
                        resolve();
                        return;
                    }

                    // Grow the container, with +/-100px of jitter.
                    currentHeight += scrollIncrement + (Math.random() * 200 - 100);
                    presentationDiv.style.height = `${currentHeight}px`;

                    scrollCount++;
                }, 500 + Math.random() * 500); // interval picked once, 500-1000 ms
            });
        });
    }

    /**
     * Waits for a random whole number of milliseconds in `[min, max]`.
     *
     * @param {number} [min=1000]
     * @param {number} [max=3000]
     * @returns {Promise<void>}
     */
    async randomDelay(min = 1000, max = 3000) {
        const delay = Math.floor(Math.random() * (max - min + 1) + min);
        await SpotifyAlbumsScraper.sleep(delay);
    }
}

// Entry point. Actor.main() initialises the Actor, runs the callback and exits
// the process when it settles, so no separate Actor.init() call is needed.
// The returned promise is awaited so it is not left floating.
await Actor.main(async () => {
    const input = await Actor.getInput();

    const crawler = new SpotifyAlbumsScraper();
    await crawler.run(input);
});

Developer
Maintained by Community

Actor Metrics

  • 2 monthly users

  • 0 No stars yet

  • >99% runs succeeded

  • Created in Oct 2024

  • Modified 4 months ago

Categories