Alibaba Scraper

No credit card required

This Actor is under maintenance.

This Actor is under maintenance and may be unreliable.

Alibaba Scraper

Alibaba Scraper

lexis-solutions/alibaba-scraper

No credit card required

The Apify Alibaba Scraper is an efficient web crawling tool designed to scrape Alibaba, extracting product information, prices, and reviews. This crawler streamlines data collection by quickly crawling and scraping content, providing valuable insights for research and analysis.

.dockerignore

1# configurations 2.idea 3 4# crawlee and apify storage folders 5apify_storage 6crawlee_storage 7storage 8 9# installed files 10node_modules 11

.gitignore

1# This file tells Git which files shouldn't be added to source control 2 3.idea 4dist 5node_modules 6apify_storage 7crawlee_storage 8storage 9

Dockerfile

# Multi-stage build: stage 1 ("builder") installs dev dependencies and
# compiles TypeScript; stage 2 keeps only production deps + built JS.
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-puppeteer-chrome:16 AS builder

# Copy just package.json and yarn.lock
# to speed up the build using Docker layer cache.
COPY --chown=myuser package.json yarn.lock ./

# Install all dependencies. Don't audit to speed up the installation.
RUN yarn --production=false

# Next, copy the source files using the user set
# in the base image.
COPY --chown=myuser . ./

# Install all dependencies and build the project.
# Don't audit to speed up the installation.
RUN yarn build

# Create final image
FROM apify/actor-node-puppeteer-chrome:16

# Copy only built JS files from builder image
COPY --from=builder --chown=myuser /home/myuser/dist ./dist

# Copy just package.json and yarn.lock
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json yarn.lock ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN yarn --prod \
 && echo "Installed NPM packages:" \
 && (yarn list --depth 1 || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "Yarn version:" \
 && yarn --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
# NOTE(review): this also copies the TS sources into the runtime image;
# only dist/ is needed at runtime — confirm whether this COPY is intentional.
COPY --chown=myuser . ./


# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && yarn start:prod --silent

apify.json

1{ 2 "name": "alibaba-scraper", 3 "version": "0.0", 4 "buildTag": "latest", 5 "env": null, 6 "$schema": { 7 "startUrl": {} 8 } 9} 10

package.json

1{ 2 "name": "alibaba", 3 "version": "0.0.1", 4 "dependencies": { 5 "@crawlee/puppeteer": "3.3.0", 6 "apify": "3.1.2", 7 "crawlee": "^3.2.2", 8 "prettier": "^2.8.8", 9 "puppeteer": "^19.7.2" 10 }, 11 "devDependencies": { 12 "@apify/tsconfig": "0.1.0", 13 "@types/node": "18.15.0", 14 "ts-node": "10.8.0", 15 "typescript": "4.7.4" 16 }, 17 "scripts": { 18 "dev": "apify run", 19 "start": "ts-node-esm -T src/main.ts", 20 "start:prod": "node dist/main.js", 21 "build": "tsc", 22 "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1" 23 }, 24 "license": "ISC" 25} 26

tsconfig.json

1{ 2 "extends": "@apify/tsconfig", 3 "compilerOptions": { 4 "module": "CommonJS", 5 "target": "ES2022", 6 "moduleResolution": "node", 7 "outDir": "dist", 8 "lib": ["DOM"], 9 "noImplicitAny": false, 10 "noUnusedLocals": false, 11 "noUnusedParameters": false, 12 "esModuleInterop": true 13 }, 14 "include": ["./src/**/*"] 15} 16

.actor/actor.json

1{ 2 "actorSpecification": 1, 3 "name": "alibaba-scraper", 4 "title": "Alibaba scraper", 5 "version": "0.0", 6 "input": "./input.json", 7 "storages": { 8 "dataset": "./output.json" 9 } 10} 11

.actor/input.json

1{ 2 "title": "Alibaba Scraper", 3 "description": "This is alibaba scraper input schema", 4 "type": "object", 5 "schemaVersion": 1, 6 "properties": { 7 "startUrls": { 8 "title": "Start URLs", 9 "type": "array", 10 "description": "URLs to scrape", 11 "editor": "requestListSources", 12 "prefill": [ 13 { 14 "url": "https://www.alibaba.com/trade/search?fsb=y&IndexArea=product_en&categoryId=201153401&keywords=Groom+Wear&knowledgeGraphId=100010232-10000166340&viewtype=L&&pricef=80&pricet" 15 } 16 ] 17 }, 18 "maxItems": { 19 "title": "Max Items", 20 "type": "integer", 21 "description": "The number of items to be scraped.", 22 "prefill": 1 23 }, 24 "proxyConfiguration": { 25 "title": "Proxy Configuration", 26 "type": "object", 27 "description": "Your proxy configuration from Apify", 28 "editor": "proxy" 29 } 30 }, 31 "required": ["startUrls"] 32} 33

.actor/output.json

1{ 2 "actorSpecification": 1, 3 "fields": {}, 4 "views": { 5 "overview": { 6 "title": "Overview", 7 "transformation": { 8 "fields": [ 9 "name", 10 "price", 11 "brand", 12 "link" 13 ] 14 }, 15 "display": { 16 "component": "table", 17 "properties": { 18 "type": "object", 19 "properties": { 20 "name": { 21 "type": "string", 22 "description": "The name of the product", 23 "label" : "Name" 24 }, 25 "price": { 26 "type": "string", 27 "label" : "Price" 28 }, 29 "brand": { 30 "type": "string", 31 "description": "The brand of the product", 32 "label" : "Brand" 33 }, 34 "link" : { 35 "type" : "string", 36 "description" : "Link to product", 37 "label" : "Link" 38 } 39 } 40 } 41 } 42 } 43 } 44} 45

src/main.ts

1import { Actor } from 'apify'; 2import { PuppeteerCrawler } from 'crawlee'; 3 4import { defaultRequestHandler, getStartUrlsArray } from './routes'; 5 6const disallowedDomains = [ 7 'cookielaw.org', 8 'cdn.cookielaw.org', 9 'googletagmanager.com', 10 'google-analytics.com', 11 'connect.facebook.netsa', 12 'cdn.cookielaw.org', 13 'analytics.tiktok.com', 14 'bat.bing.com', 15 'accounts.google.com', 16 'facebook.com', 17 'adservice.google.com', 18 'gj.mmstat.com', 19 'img.alicdn.com', 20]; 21 22Actor.main(async () => { 23 const input: any = await Actor.getInput(); 24 25 const proxyConfiguration = await Actor.createProxyConfiguration( 26 input.proxyConfiguration 27 ); 28 const startUrls: any = getStartUrlsArray(input.startUrls); 29 30 const crawler = new PuppeteerCrawler({ 31 requestHandler: defaultRequestHandler, 32 preNavigationHooks: [ 33 async ({ addInterceptRequestHandler }) => { 34 await addInterceptRequestHandler((request) => { 35 const requestUrl = request.url(); 36 if (disallowedDomains.some((domain) => requestUrl.includes(domain))) { 37 return request.abort(); 38 } 39 return request.continue(); 40 }); 41 }, 42 ], 43 headless: process.env.ACTOR_RUN_ID !== undefined, 44 proxyConfiguration, 45 }); 46 47 await crawler.run(startUrls); 48}); 49

src/routes.ts

1import { Actor } from 'apify'; 2import { Dataset, PuppeteerRequestHandler } from 'crawlee'; 3 4enum Label { 5 'detailPage', 6 'searchPage', 7} 8 9export const detailPageHandler: PuppeteerRequestHandler = async (args) => { 10 const { page, request } = args; 11 let data = await page.evaluate(() => { 12 const ldJson: any = document.querySelector( 13 'script[type="application/ld+json"]' 14 ); 15 return JSON.parse(ldJson.textContent); 16 }); 17 18 if (!data) { 19 throw new Error('Not data found!'); 20 } 21 22 if (data.length > 1) { 23 data = data[0]; 24 } 25 const link = request.url; 26 const name = data?.name; 27 const sku = data?.sku; 28 const description = data?.description; 29 const image = data?.image; 30 const brand = data?.brand?.name; 31 let price = data?.offers?.price; 32 let reviewRatingValue; 33 const labels = await page.evaluate(() => { 34 const leadTimeElem = document.querySelector('.lead-time'); 35 36 if (leadTimeElem) { 37 let readyToShip, inStock, fastDispatch; 38 39 const preIconElem: any = leadTimeElem.querySelector('.pre-icon'); 40 const iconInsElem: any = leadTimeElem.querySelector('.icon-ins'); 41 42 if (preIconElem) { 43 readyToShip = preIconElem.textContent.trim(); 44 } 45 46 if (iconInsElem) { 47 const fastDispatchElem = iconInsElem.querySelector( 48 '.detail-next-icon-success' 49 ); 50 if (fastDispatchElem) { 51 fastDispatch = fastDispatchElem.nextSibling.textContent.trim(); 52 } 53 inStock = iconInsElem.childNodes[1].textContent.trim(); 54 } 55 if (readyToShip && inStock && fastDispatch) 56 return [readyToShip, inStock, fastDispatch]; 57 } 58 return null; 59 }); 60 61 if (data.review) { 62 reviewRatingValue = data?.review[0]?.reviewRating?.ratingValue; 63 } 64 65 const additionalPhotosArray = await page.evaluate(() => { 66 const slickTrack: any = document.querySelector('.detail-next-slick-track'); 67 const imgElements = slickTrack.querySelectorAll('img'); 68 const links: any = []; 69 70 imgElements.forEach(function (imgElement) { 71 const link 
= imgElement.getAttribute('src'); 72 links.push(link); 73 }); 74 return links; 75 }); 76 77 let unitPricing: any, princeRange: any; 78 79 if (price) { 80 const prices: any = []; 81 const quantities: any = []; 82 83 const result = await page.evaluate( 84 (prices, quantities) => { 85 let priceItems = document.querySelectorAll('.price-item'); 86 let isLadderPrice = false; 87 88 if (!priceItems.length) { 89 priceItems = document.querySelectorAll( 90 '[data-role="ladder-price-item"]' 91 ); 92 isLadderPrice = true; 93 } 94 priceItems.forEach((item) => { 95 let quantityText, priceText; 96 97 if (isLadderPrice) { 98 const quantityRange: any = item.querySelector('.ma-quantity-range'); 99 quantityText = quantityRange?.textContent; 100 const priceVal: any = item.querySelector('.priceVal'); 101 priceText = priceVal.getAttribute('title').substring(1); 102 } else { 103 const quantityElement: any = item.querySelector('.quality'); 104 quantityText = quantityElement?.textContent; 105 const priceElement: any = item.querySelector('.price span'); 106 priceText = priceElement?.textContent.substring(1); 107 } 108 quantities.push(quantityText); 109 prices.push(priceText); 110 }); 111 112 return { quantities, prices }; 113 }, 114 prices, 115 quantities 116 ); 117 118 unitPricing = result.quantities.map((quantity, i) => { 119 const range = quantity?.match(/\d+/g).map(Number); 120 const price = result.prices[i]; 121 return { 122 minUnits: range?.[0] || null, 123 maxUnits: range?.[1] || null, 124 priceString: price || null, 125 price: Number(price), 126 }; 127 }); 128 } 129 130 if (!price) { 131 [unitPricing, price, princeRange] = await page.evaluate(() => { 132 const pricePromotionElement: any = document.querySelector('.promotion'); // wholesale cases 133 const priceElement: any = document.querySelector( 134 '.promotion-price strong.normal' 135 ); 136 137 if (pricePromotionElement) { 138 const priceList: any = document.querySelector('.price-list'); // promotion sales 139 const priceItems = 
priceList.querySelectorAll('.price-item'); 140 const unitPricing: any = []; 141 142 priceItems.forEach((item) => { 143 const qualityText = item.querySelector('.quality').textContent; 144 const price = item.querySelector('.price span').textContent; 145 const range = qualityText.match(/\d+/g).map(Number); 146 unitPricing.push({ 147 minUnits: range[0], 148 maxUnits: range[1], 149 priceString: price, 150 price: Number(price.replace('$', '')), 151 }); 152 }); 153 154 return [unitPricing]; 155 } else if (priceElement) { 156 // piece sale cases 157 const priceText = priceElement.textContent; 158 return [null, priceText]; 159 } else { 160 const priceRange: any = document 161 .querySelector('.price-range .price') 162 ?.textContent?.split(' - '); 163 const minOrderString: any = document 164 .querySelector('.price-range .moq') 165 ?.textContent?.replace('piece/pieces', '') 166 .trim(); 167 168 return [ 169 null, 170 null, 171 { 172 rangeMinString: priceRange[0], 173 rangeMin: priceRange[0] 174 ? Number(priceRange[0]?.replace('$', '')) 175 : undefined, 176 rangeMaxString: priceRange[1], 177 rangeMax: priceRange[1] 178 ? 
Number(priceRange[1]?.replace('$', '')) 179 : undefined, 180 minOrderString, 181 minOrder: Number(minOrderString), 182 }, 183 ]; 184 } 185 186 return ['???', null, null]; 187 }); 188 } 189 const dataObject = { 190 name, 191 sku, 192 labels: labels || null, 193 priceString: price, 194 price: Number(price), 195 princeRange, 196 unitPricing, 197 brand, 198 link, 199 description: description || null, 200 image, 201 reviewRatingValue: reviewRatingValue || null, 202 additionalPhotosArray, 203 }; 204 const filteredDataObject = Object.entries(dataObject).reduce( 205 (obj, [key, value]) => { 206 if (value !== null) { 207 obj[key] = value; 208 } 209 return obj; 210 }, 211 {} 212 ); 213 await Dataset.pushData(filteredDataObject); 214}; 215 216export const defaultRequestHandler: PuppeteerRequestHandler = async (args) => { 217 const { enqueueLinks, request, log, page } = args; 218 const input: any = await Actor.getInput(); 219 const maxItems = input?.maxItems; 220 const label: Label = request.userData.label; 221 const itemsAdded: number = request.userData.itemsAdded || 0; 222 223 log.info(`crawling ${request.url}`); 224 225 switch (label) { 226 case Label.detailPage: { 227 return detailPageHandler(args); 228 } 229 case Label.searchPage: { 230 const links = await page.evaluate(() => { 231 const elements = document.querySelectorAll( 232 'div.list-no-v2-outter.J-offer-wrapper.traffic-product-card > div > div > a[href]' 233 ); 234 return Array.from(elements).map((element: any) => element.href); 235 }); 236 await enqueueLinks({ 237 urls: maxItems ? 
links.slice(0, itemsAdded) : links, 238 userData: { 239 label: Label.detailPage, 240 }, 241 }); 242 return; 243 } 244 245 default: { 246 if (request.url.includes('/product-detail/')) { 247 return detailPageHandler(args); 248 } 249 const data = await page.evaluate(() => { 250 const buttonArray = document.querySelectorAll( 251 'div.seb-pagination__pages > a[href]' 252 ); 253 const lastButton: any = buttonArray[buttonArray.length - 1]; 254 if (lastButton) { 255 return lastButton.getAttribute('href').match(/page=(\d+)/)[1] || null; 256 } else { 257 return null; 258 } 259 }); 260 261 const hasMaxItems = Boolean(maxItems); 262 const hasData = Boolean(data); 263 264 let numPages = hasData ? Math.min(data, Math.ceil(maxItems / 48)) : 1; 265 if (!numPages) numPages = data; 266 const searchPagesArray = Array.from( 267 { length: numPages }, 268 (_, i) => request.url + `&page=${i + 1}` 269 ); 270 271 let items = hasMaxItems ? maxItems : numPages * 48; 272 await Promise.all( 273 searchPagesArray.map((element) => { 274 const userData = { 275 label: Label.searchPage, 276 itemsAdded: items, 277 }; 278 items -= 48; 279 return enqueueLinks({ 280 urls: [element], 281 userData: userData, 282 }); 283 }) 284 ); 285 } 286 } 287}; 288 289export const getStartUrlsArray = (startUrls) => { 290 if (startUrls) { 291 return startUrls.map(({ url }) => { 292 return url; 293 }); 294 } 295}; 296
Developer
Maintained by Community
Actor stats
  • 42 users
  • 315 runs
  • Modified 4 months ago
Categories

You might also like these Actors