
Spotify Albums Scraper
2-hour trial, then $19.99/month - no credit card required now

Spotify Albums Scraper
2-hour trial, then $19.99/month - no credit card required now
Scrape Spotify albums by keywords. Extract comprehensive album data including artist details, cover art, release dates, and playability status. Perfect for music cataloging, album research, and industry analysis.
import { Actor } from 'apify';
import { PuppeteerCrawler } from 'crawlee';
import puppeteerExtra from 'puppeteer-extra';
import stealthPlugin from 'puppeteer-extra-plugin-stealth';
import Redis from 'ioredis';
import randomUseragent from 'random-useragent';
import fetch from 'node-fetch';
// Register the stealth plugin on the shared puppeteer-extra instance; SpotifyAlbumsScraper.run()
// later hands this same instance to the crawler as its browser launcher.
puppeteerExtra.use(stealthPlugin());
// NOTE(review): disabled Redis lookup of a stored session cookie, kept for reference only.
// The connection URL used to embed a credential directly in source; it is redacted here.
// Rotate that credential and load it from a secret / environment variable if this is re-enabled.
// const redisClient = new Redis("rediss://default:<REDACTED>@sharing-wren-36721.upstash.io:6379"); // let sessionid = await redisClient.get('instagram-sessionid-cookie');
/**
 * Scrapes Spotify album search results for a list of keywords.
 *
 * For every keyword a PuppeteerCrawler opens the Spotify web search page. The scraper
 * listens for the `searchAlbums` pathfinder responses received by that page, pushes the
 * albums they contain to the Actor dataset, and makes the page request the next result
 * page until `maxItems` albums were collected or no new data shows up.
 */
class SpotifyAlbumsScraper {
  /**
   * Builds the Spotify web search URL for one keyword.
   * The keyword is percent-encoded so characters such as `/`, `?`, `#` or `%`
   * cannot change the structure of the URL.
   *
   * @param {string} keyword - Raw search keyword.
   * @returns {string} Album search URL on open.spotify.com.
   */
  static buildSearchUrl(keyword) {
    return `https://open.spotify.com/search/${encodeURIComponent(keyword)}/albums`;
  }

  /**
   * Converts a Spotify URI (`spotify:album:<id>`) into the public album page URL.
   *
   * @param {unknown} uri - URI taken from an album item.
   * @returns {string|null} Album URL, or `null` when no usable URI is present.
   */
  static buildAlbumUrl(uri) {
    if (typeof uri !== 'string' || uri === '') {
      return null;
    }
    // The ID is the last colon-separated segment of the URI.
    const albumId = uri.split(':').pop();
    return `https://open.spotify.com/album/${albumId}`;
  }

  /**
   * Derives the URL of the next result page from an intercepted pathfinder URL by
   * advancing the `offset` stored inside its JSON `variables` query parameter.
   *
   * @param {string} url - URL of the intercepted `searchAlbums` request.
   * @param {number} [pageSize=30] - Amount the offset is advanced by.
   * @returns {string} URL that requests the following page.
   */
  static buildNextPageUrl(url, pageSize = 30) {
    const params = new URL(url).searchParams;
    const variables = JSON.parse(params.get('variables') || '{}');
    const currentOffset = Number.parseInt(String(variables.offset ?? 0), 10) || 0;
    variables.offset = currentOffset + pageSize;
    params.set('variables', JSON.stringify(variables));
    return `${url.split('?')[0]}?${params.toString()}`;
  }

  /**
   * Validates the `keywords` input and returns it as an array.
   * A single string is treated as one keyword instead of being iterated per character.
   *
   * @param {unknown} keywords - Value of `input.keywords`.
   * @returns {string[]} Keywords to search for.
   * @throws {TypeError} When `keywords` is missing or not iterable.
   */
  static normalizeKeywords(keywords) {
    if (typeof keywords === 'string') {
      return [keywords];
    }
    if (keywords == null || typeof keywords[Symbol.iterator] !== 'function') {
      throw new TypeError('Input must contain a "keywords" array.');
    }
    return [...keywords];
  }

  /**
   * Runs one crawler per keyword, sequentially.
   *
   * @param {{ keywords: string[], maxItems?: number }} input - Actor input.
   *   `maxItems` limits the albums collected per keyword (unlimited when omitted or null).
   * @returns {Promise<void>}
   * @throws {TypeError} When `input.keywords` is missing or not iterable.
   */
  async run(input) {
    const { keywords, maxItems } = input ?? {};
    const keywordList = SpotifyAlbumsScraper.normalizeKeywords(keywords);

    // Stored on the instance because the response interceptors read the limit from there.
    this.maxItems = maxItems ?? Infinity;

    for (const keyword of keywordList) {
      // Remembered so processResponseData can tag every album with its keyword.
      this.currentSearchUrl = keyword;

      const userAgent = randomUseragent.getRandom(
        (ua) => ua.browserName === 'Chrome' && Number.parseFloat(ua.browserVersion) >= 80,
      );

      const crawler = new PuppeteerCrawler({
        launchContext: {
          launcher: puppeteerExtra,
          launchOptions: {
            headless: true,
            args: [
              '--no-sandbox',
              '--disable-setuid-sandbox',
              '--disable-dev-shm-usage',
              '--disable-accelerated-2d-canvas',
              '--no-first-run',
              '--no-zygote',
              '--disable-gpu',
              '--disable-geolocation',
              '--disable-notifications',
              // Only override the user agent when the filter matched an entry; otherwise the
              // flag would carry the literal text of a missing value (e.g. "null").
              ...(userAgent ? [`--user-agent=${userAgent}`] : []),
            ],
          },
        },
        requestHandlerTimeoutSecs: 3600, // allow the request handler to run for up to 1 hour
        navigationTimeoutSecs: 300, // 5 minutes
        preNavigationHooks: [
          async ({ page }) => {
            await this.setCookies(page);
          },
        ],
        requestHandler: async ({ page, request }) => {
          console.log(`Processing ${request.url}...`);
          const interceptedData = [];

          await this.setupInterceptors(page, interceptedData);

          // NOTE(review): the crawler presumably has already opened request.url before this
          // handler runs; loading it again makes the search requests fire with the
          // interceptors attached - confirm before removing.
          await page.goto(request.url, { waitUntil: 'networkidle0' });

          await this.waitForResults(interceptedData);

          console.log(`Saved ${interceptedData.length} items`);
        },
      });

      await crawler.run([SpotifyAlbumsScraper.buildSearchUrl(keyword)]);
    }
  }

  /**
   * Polls `interceptedData` until `this.maxItems` albums were collected or its length
   * stayed unchanged for `maxIdlePolls` consecutive polls (defaults: 5 polls x 2 s = 10 s).
   *
   * @param {object[]} interceptedData - Array the response interceptors append to.
   * @param {number} [pollIntervalMs=2000] - Delay between two checks.
   * @param {number} [maxIdlePolls=5] - Consecutive unchanged polls that end the wait.
   * @returns {Promise<void>}
   */
  async waitForResults(interceptedData, pollIntervalMs = 2000, maxIdlePolls = 5) {
    let lastLength = 0;
    let idlePolls = 0;

    while (interceptedData.length < this.maxItems && idlePolls < maxIdlePolls) {
      await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));

      if (interceptedData.length === lastLength) {
        idlePolls += 1;
      } else {
        idlePolls = 0;
        lastLength = interceptedData.length;
      }
    }
  }

  /**
   * Sets a fixed cookie on the page before navigation.
   *
   * FIXME(review): this is a hard-coded `sid_guard` session cookie for `.tiktok.com`, which
   * looks like a leftover from another scraper and is not sent to Spotify. Behaviour is kept
   * unchanged here; remove it (and rotate the session) once confirmed unused.
   *
   * @param {object} page - Puppeteer page.
   * @returns {Promise<void>}
   */
  async setCookies(page) {
    await page.setCookie({
      name: 'sid_guard',
      value: 'f917952be638f4225aa82251d42e8d5a%7C1729996200%7C15551986%7CFri%2C+25-Apr-2025+02%3A29%3A46+GMT',
      domain: '.tiktok.com',
      path: '/',
      expires: Math.floor(new Date('2025-11-11').getTime() / 1000),
      httpOnly: true,
      secure: true,
    });
  }

  /**
   * Enables request interception and registers the response listener that collects
   * album data and triggers the request for the next result page.
   *
   * @param {object} page - Puppeteer page.
   * @param {object[]} interceptedData - Array that receives the collected albums.
   * @returns {Promise<void>}
   */
  async setupInterceptors(page, interceptedData) {
    await page.setRequestInterception(true);
    // Interception is enabled, so every request has to be continued explicitly.
    page.on('request', (request) => request.continue());

    page.on('response', async (response) => {
      const request = response.request();
      const url = request.url();

      try {
        if (!(await this.shouldInterceptRequest(url, request))) {
          return;
        }

        const responseBody = await response.json();
        const addedCount = await this.processResponseData(responseBody, interceptedData);

        // Stop paginating once the limit is reached or a page brought no albums;
        // requesting further offsets would only return more empty pages.
        if (addedCount === 0 || interceptedData.length >= this.maxItems) {
          return;
        }

        // Pause between page requests.
        await new Promise((resolve) => setTimeout(resolve, 3000));

        const nextUrl = SpotifyAlbumsScraper.buildNextPageUrl(url);

        // Issue the request from inside the page so it carries the page's credentials.
        // Its response passes through this same listener, which continues the pagination.
        await page.evaluate(
          async ({ nextUrl: targetUrl, headers }) => {
            await fetch(targetUrl, {
              method: 'GET',
              headers,
              credentials: 'include',
            });
          },
          { nextUrl, headers: request.headers() },
        );
      } catch (error) {
        console.error(`Error processing response:`, error);
      }
    });
  }

  /**
   * Extracts albums from one `searchAlbums` response, appends them to `interceptedData`
   * (never exceeding `this.maxItems`) and pushes the newly added ones to the dataset.
   *
   * @param {object} responseBody - Parsed JSON body of the response.
   * @param {object[]} interceptedData - Array of albums collected so far; mutated in place.
   * @returns {Promise<number>} Number of albums added from this response.
   */
  async processResponseData(responseBody, interceptedData) {
    const items = responseBody?.data?.searchV2?.albumsV2?.items ?? [];
    const remaining = this.maxItems - interceptedData.length;
    const newAlbums = [];

    for (const item of items) {
      if (newAlbums.length >= remaining) {
        break;
      }
      const data = item?.data;
      if (!data) {
        continue;
      }
      newAlbums.push({
        ...data,
        albumUrl: SpotifyAlbumsScraper.buildAlbumUrl(data.uri),
        keyword: this.currentSearchUrl, // keyword that produced this album
      });
    }

    if (newAlbums.length === 0) {
      return 0;
    }

    // Appended before the first await so concurrent responses see the updated count.
    interceptedData.push(...newAlbums);
    await Actor.pushData(newAlbums);
    console.log(`Pushed ${newAlbums.length} items`);
    return newAlbums.length;
  }

  /**
   * Tells whether a request is a GET pathfinder query whose `operationName` is `searchAlbums`.
   *
   * @param {string} url - Request URL.
   * @param {object} request - Puppeteer request (only `method()` is used).
   * @returns {Promise<boolean>} `true` when the response should be processed.
   */
  async shouldInterceptRequest(url, request) {
    if (!url.includes('/pathfinder/v1/query') || request.method() !== 'GET') {
      return false;
    }
    try {
      return new URL(url).searchParams.get('operationName') === 'searchAlbums';
    } catch (error) {
      console.error('Error parsing URL:', error);
      return false;
    }
  }

  /**
   * Simulates scrolling by growing the height of the track-list container in 3-5 steps.
   * Resolves immediately when the container is not found.
   *
   * @param {object} page - Puppeteer page.
   * @returns {Promise<void>}
   */
  async scrollPage(page) {
    await page.evaluate(
      () =>
        new Promise((resolve) => {
          const presentationDiv = document.querySelector('[data-testid="track-list"] > div:nth-child(2)');
          if (!presentationDiv) {
            console.log('Presentation div not found');
            resolve();
            return;
          }

          const scrollIncrement = 1000; // base growth per step, in px
          // Parsed as a number: the inline style is a string such as "500px", and adding to
          // the raw string would concatenate instead of sum.
          let currentHeight = Number.parseFloat(presentationDiv.style.height) || 1000;
          const scrollSteps = Math.floor(Math.random() * 3) + 3; // random 3-5 steps
          let scrollCount = 0;

          const scrollInterval = setInterval(() => {
            if (scrollCount >= scrollSteps) {
              clearInterval(scrollInterval);
              resolve();
              return;
            }

            // Grow the container, with +/-100 px of jitter.
            currentHeight += scrollIncrement + (Math.random() * 200 - 100);
            presentationDiv.style.height = `${currentHeight}px`;

            scrollCount++;
          }, 500 + Math.random() * 500); // interval picked once, between 500 and 1000 ms
        }),
    );
  }

  /**
   * Waits for a random time between `min` and `max` milliseconds (inclusive).
   *
   * @param {number} [min=1000] - Minimum delay in ms.
   * @param {number} [max=3000] - Maximum delay in ms.
   * @returns {Promise<void>}
   */
  async randomDelay(min = 1000, max = 3000) {
    const delay = Math.floor(Math.random() * (max - min + 1) + min);
    await new Promise((resolve) => setTimeout(resolve, delay));
  }
}
// Entry point. Actor.main() initialises the Actor, runs the callback and exits the process
// afterwards, so no separate Actor.init() call is needed; the returned promise is awaited
// so it is not left floating.
await Actor.main(async () => {
  const input = await Actor.getInput();

  const scraper = new SpotifyAlbumsScraper();
  await scraper.run(input);
});