This Actor is under maintenance.
This actor is under maintenance and it may be unreliable.

Alibaba Scraper
lexis-solutions/alibaba-scraper
The Apify Alibaba Scraper is an efficient web crawling tool designed to scrape Alibaba, extracting product information, prices, and reviews. This crawler streamlines data collection by quickly crawling and scraping content, providing valuable insights for research and analysis.
.dockerignore
1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
.gitignore
1# This file tells Git which files shouldn't be added to source control
2
3.idea
4dist
5node_modules
6apify_storage
7crawlee_storage
8storage
9
Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-puppeteer-chrome:16 AS builder

# Copy just package.json and yarn.lock
# to speed up the build using Docker layer cache.
COPY package.json yarn.lock ./

# Install all dependencies (dev dependencies included — they are needed
# for the TypeScript build). Don't audit to speed up the installation.
RUN yarn --production=false

# Next, copy the source files using the user set
# in the base image.
COPY . ./

# Compile the TypeScript sources into dist/.
RUN yarn build

# Create final image
FROM apify/actor-node-puppeteer-chrome:16

# Copy only the built JS files from the builder image.
# FIX: '--from=builder' was missing, so this COPY tried to read
# /home/myuser/dist from the build context (where dist/ is gitignored)
# instead of from the builder stage.
COPY --from=builder /home/myuser/dist ./dist

# Copy just package.json and yarn.lock
# to speed up the build using Docker layer cache.
COPY package*.json yarn.lock ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN yarn --prod \
 && echo "Installed NPM packages:" \
 && (yarn list --depth 1 || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "Yarn version:" \
 && yarn --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && yarn start:prod --silent
apify.json
1{
2 "name": "alibaba-scraper",
3 "version": "0.0",
4 "buildTag": "latest",
5 "env": null,
6 "$schema": {
7 "startUrl": {}
8 }
9}
10
package.json
1{
2 "name": "alibaba",
3 "version": "0.0.1",
4 "dependencies": {
5 "@crawlee/puppeteer": "3.3.0",
6 "apify": "3.1.2",
7 "crawlee": "^3.2.2",
8 "prettier": "^2.8.8",
9 "puppeteer": "^19.7.2"
10 },
11 "devDependencies": {
12 "@apify/tsconfig": "0.1.0",
13 "@types/node": "18.15.0",
14 "ts-node": "10.8.0",
15 "typescript": "4.7.4"
16 },
17 "scripts": {
18 "dev": "apify run",
19 "start": "ts-node-esm -T src/main.ts",
20 "start:prod": "node dist/main.js",
21 "build": "tsc",
22 "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
23 },
24 "license": "ISC"
25}
26
tsconfig.json
1{
2 "extends": "@apify/tsconfig",
3 "compilerOptions": {
4 "module": "CommonJS",
5 "target": "ES2022",
6 "moduleResolution": "node",
7 "outDir": "dist",
8 "lib": ["DOM"],
9 "noImplicitAny": false,
10 "noUnusedLocals": false,
11 "noUnusedParameters": false,
12 "esModuleInterop": true
13 },
14 "include": ["./src/**/*"]
15}
16
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "alibaba-scraper",
4 "title": "Alibaba scraper",
5 "version": "0.0",
6 "input": "./input.json",
7 "storages": {
8 "dataset": "./output.json"
9 }
10}
11
.actor/input.json
1{
2 "title": "Alibaba Scraper",
 "description": "Input schema for the Alibaba Scraper.",
4 "type": "object",
5 "schemaVersion": 1,
6 "properties": {
7 "startUrls": {
8 "title": "Start URLs",
9 "type": "array",
10 "description": "URLs to scrape",
11 "editor": "requestListSources",
12 "prefill": [
13 {
14 "url": "https://www.alibaba.com/trade/search?fsb=y&IndexArea=product_en&categoryId=201153401&keywords=Groom+Wear&knowledgeGraphId=100010232-10000166340&viewtype=L&&pricef=80&pricet"
15 }
16 ]
17 },
18 "maxItems": {
19 "title": "Max Items",
20 "type": "integer",
21 "description": "The number of items to be scraped.",
22 "prefill": 1
23 },
24 "proxyConfiguration": {
25 "title": "Proxy Configuration",
26 "type": "object",
27 "description": "Your proxy configuration from Apify",
28 "editor": "proxy"
29 }
30 },
31 "required": ["startUrls"]
32}
33
.actor/output.json
1{
2 "actorSpecification": 1,
3 "fields": {},
4 "views": {
5 "overview": {
6 "title": "Overview",
7 "transformation": {
8 "fields": [
9 "name",
10 "price",
11 "brand",
12 "link"
13 ]
14 },
15 "display": {
16 "component": "table",
17 "properties": {
18 "type": "object",
19 "properties": {
20 "name": {
21 "type": "string",
22 "description": "The name of the product",
23 "label" : "Name"
24 },
25 "price": {
26 "type": "string",
27 "label" : "Price"
28 },
29 "brand": {
30 "type": "string",
31 "description": "The brand of the product",
32 "label" : "Brand"
33 },
34 "link" : {
35 "type" : "string",
36 "description" : "Link to product",
37 "label" : "Link"
38 }
39 }
40 }
41 }
42 }
43 }
44}
45
src/main.ts
1import { Actor } from 'apify';
2import { PuppeteerCrawler } from 'crawlee';
3
4import { defaultRequestHandler, getStartUrlsArray } from './routes';
5
6const disallowedDomains = [
7 'cookielaw.org',
8 'cdn.cookielaw.org',
9 'googletagmanager.com',
10 'google-analytics.com',
11 'connect.facebook.netsa',
12 'cdn.cookielaw.org',
13 'analytics.tiktok.com',
14 'bat.bing.com',
15 'accounts.google.com',
16 'facebook.com',
17 'adservice.google.com',
18 'gj.mmstat.com',
19 'img.alicdn.com',
20];
21
22Actor.main(async () => {
23 const input: any = await Actor.getInput();
24
25 const proxyConfiguration = await Actor.createProxyConfiguration(
26 input.proxyConfiguration
27 );
28 const startUrls: any = getStartUrlsArray(input.startUrls);
29
30 const crawler = new PuppeteerCrawler({
31 requestHandler: defaultRequestHandler,
32 preNavigationHooks: [
33 async ({ addInterceptRequestHandler }) => {
34 await addInterceptRequestHandler((request) => {
35 const requestUrl = request.url();
36 if (disallowedDomains.some((domain) => requestUrl.includes(domain))) {
37 return request.abort();
38 }
39 return request.continue();
40 });
41 },
42 ],
43 headless: process.env.ACTOR_RUN_ID !== undefined,
44 proxyConfiguration,
45 });
46
47 await crawler.run(startUrls);
48});
49
src/routes.ts
1import { Actor } from 'apify';
2import { Dataset, PuppeteerRequestHandler } from 'crawlee';
3
4enum Label {
5 'detailPage',
6 'searchPage',
7}
8
9export const detailPageHandler: PuppeteerRequestHandler = async (args) => {
10 const { page, request } = args;
11 let data = await page.evaluate(() => {
12 const ldJson: any = document.querySelector(
13 'script[type="application/ld+json"]'
14 );
15 return JSON.parse(ldJson.textContent);
16 });
17
18 if (!data) {
19 throw new Error('Not data found!');
20 }
21
22 if (data.length > 1) {
23 data = data[0];
24 }
25 const link = request.url;
26 const name = data?.name;
27 const sku = data?.sku;
28 const description = data?.description;
29 const image = data?.image;
30 const brand = data?.brand?.name;
31 let price = data?.offers?.price;
32 let reviewRatingValue;
33 const labels = await page.evaluate(() => {
34 const leadTimeElem = document.querySelector('.lead-time');
35
36 if (leadTimeElem) {
37 let readyToShip, inStock, fastDispatch;
38
39 const preIconElem: any = leadTimeElem.querySelector('.pre-icon');
40 const iconInsElem: any = leadTimeElem.querySelector('.icon-ins');
41
42 if (preIconElem) {
43 readyToShip = preIconElem.textContent.trim();
44 }
45
46 if (iconInsElem) {
47 const fastDispatchElem = iconInsElem.querySelector(
48 '.detail-next-icon-success'
49 );
50 if (fastDispatchElem) {
51 fastDispatch = fastDispatchElem.nextSibling.textContent.trim();
52 }
53 inStock = iconInsElem.childNodes[1].textContent.trim();
54 }
55 if (readyToShip && inStock && fastDispatch)
56 return [readyToShip, inStock, fastDispatch];
57 }
58 return null;
59 });
60
61 if (data.review) {
62 reviewRatingValue = data?.review[0]?.reviewRating?.ratingValue;
63 }
64
65 const additionalPhotosArray = await page.evaluate(() => {
66 const slickTrack: any = document.querySelector('.detail-next-slick-track');
67 const imgElements = slickTrack.querySelectorAll('img');
68 const links: any = [];
69
70 imgElements.forEach(function (imgElement) {
71 const link = imgElement.getAttribute('src');
72 links.push(link);
73 });
74 return links;
75 });
76
77 let unitPricing: any, princeRange: any;
78
79 if (price) {
80 const prices: any = [];
81 const quantities: any = [];
82
83 const result = await page.evaluate(
84 (prices, quantities) => {
85 let priceItems = document.querySelectorAll('.price-item');
86 let isLadderPrice = false;
87
88 if (!priceItems.length) {
89 priceItems = document.querySelectorAll(
90 '[data-role="ladder-price-item"]'
91 );
92 isLadderPrice = true;
93 }
94 priceItems.forEach((item) => {
95 let quantityText, priceText;
96
97 if (isLadderPrice) {
98 const quantityRange: any = item.querySelector('.ma-quantity-range');
99 quantityText = quantityRange?.textContent;
100 const priceVal: any = item.querySelector('.priceVal');
101 priceText = priceVal.getAttribute('title').substring(1);
102 } else {
103 const quantityElement: any = item.querySelector('.quality');
104 quantityText = quantityElement?.textContent;
105 const priceElement: any = item.querySelector('.price span');
106 priceText = priceElement?.textContent.substring(1);
107 }
108 quantities.push(quantityText);
109 prices.push(priceText);
110 });
111
112 return { quantities, prices };
113 },
114 prices,
115 quantities
116 );
117
118 unitPricing = result.quantities.map((quantity, i) => {
119 const range = quantity?.match(/\d+/g).map(Number);
120 const price = result.prices[i];
121 return {
122 minUnits: range?.[0] || null,
123 maxUnits: range?.[1] || null,
124 priceString: price || null,
125 price: Number(price),
126 };
127 });
128 }
129
130 if (!price) {
131 [unitPricing, price, princeRange] = await page.evaluate(() => {
132 const pricePromotionElement: any = document.querySelector('.promotion'); // wholesale cases
133 const priceElement: any = document.querySelector(
134 '.promotion-price strong.normal'
135 );
136
137 if (pricePromotionElement) {
138 const priceList: any = document.querySelector('.price-list'); // promotion sales
139 const priceItems = priceList.querySelectorAll('.price-item');
140 const unitPricing: any = [];
141
142 priceItems.forEach((item) => {
143 const qualityText = item.querySelector('.quality').textContent;
144 const price = item.querySelector('.price span').textContent;
145 const range = qualityText.match(/\d+/g).map(Number);
146 unitPricing.push({
147 minUnits: range[0],
148 maxUnits: range[1],
149 priceString: price,
150 price: Number(price.replace('$', '')),
151 });
152 });
153
154 return [unitPricing];
155 } else if (priceElement) {
156 // piece sale cases
157 const priceText = priceElement.textContent;
158 return [null, priceText];
159 } else {
160 const priceRange: any = document
161 .querySelector('.price-range .price')
162 ?.textContent?.split(' - ');
163 const minOrderString: any = document
164 .querySelector('.price-range .moq')
165 ?.textContent?.replace('piece/pieces', '')
166 .trim();
167
168 return [
169 null,
170 null,
171 {
172 rangeMinString: priceRange[0],
173 rangeMin: priceRange[0]
174 ? Number(priceRange[0]?.replace('$', ''))
175 : undefined,
176 rangeMaxString: priceRange[1],
177 rangeMax: priceRange[1]
178 ? Number(priceRange[1]?.replace('$', ''))
179 : undefined,
180 minOrderString,
181 minOrder: Number(minOrderString),
182 },
183 ];
184 }
185
186 return ['???', null, null];
187 });
188 }
189 const dataObject = {
190 name,
191 sku,
192 labels: labels || null,
193 priceString: price,
194 price: Number(price),
195 princeRange,
196 unitPricing,
197 brand,
198 link,
199 description: description || null,
200 image,
201 reviewRatingValue: reviewRatingValue || null,
202 additionalPhotosArray,
203 };
204 const filteredDataObject = Object.entries(dataObject).reduce(
205 (obj, [key, value]) => {
206 if (value !== null) {
207 obj[key] = value;
208 }
209 return obj;
210 },
211 {}
212 );
213 await Dataset.pushData(filteredDataObject);
214};
215
/**
 * Entry request handler for every request the crawler processes.
 *
 * Routing:
 *  - userData.label === Label.detailPage -> delegate to detailPageHandler;
 *  - userData.label === Label.searchPage -> enqueue product links found
 *    on the search-results page as detail pages;
 *  - no label (start URLs): '/product-detail/' URLs go straight to the
 *    detail handler; otherwise the URL is treated as a search query and
 *    its paginated result pages are enqueued with a per-page item budget.
 */
export const defaultRequestHandler: PuppeteerRequestHandler = async (args) => {
  const { enqueueLinks, request, log, page } = args;
  // The actor input is re-fetched per request; only maxItems is used here.
  const input: any = await Actor.getInput();
  const maxItems = input?.maxItems;
  const label: Label = request.userData.label;
  // Remaining item budget for this search page, set when the page was
  // enqueued in the default branch below; 0 when absent.
  const itemsAdded: number = request.userData.itemsAdded || 0;

  log.info(`crawling ${request.url}`);

  switch (label) {
    case Label.detailPage: {
      return detailPageHandler(args);
    }
    case Label.searchPage: {
      // Collect the product links from the result cards on this page.
      const links = await page.evaluate(() => {
        const elements = document.querySelectorAll(
          'div.list-no-v2-outter.J-offer-wrapper.traffic-product-card > div > div > a[href]'
        );
        return Array.from(elements).map((element: any) => element.href);
      });
      // When maxItems is set, only the first `itemsAdded` links are
      // enqueued — the budget assigned to this page when it was queued.
      // NOTE(review): assumes itemsAdded stays positive; the page count
      // cap in the default branch appears to guarantee this — confirm.
      await enqueueLinks({
        urls: maxItems ? links.slice(0, itemsAdded) : links,
        userData: {
          label: Label.detailPage,
        },
      });
      return;
    }

    default: {
      // Start URLs carry no label; direct product links are handled as
      // detail pages immediately.
      if (request.url.includes('/product-detail/')) {
        return detailPageHandler(args);
      }
      // Read the page number from the last pagination button's href to
      // learn the total number of search pages (null when no pagination).
      const data = await page.evaluate(() => {
        const buttonArray = document.querySelectorAll(
          'div.seb-pagination__pages > a[href]'
        );
        const lastButton: any = buttonArray[buttonArray.length - 1];
        if (lastButton) {
          return lastButton.getAttribute('href').match(/page=(\d+)/)[1] || null;
        } else {
          return null;
        }
      });

      const hasMaxItems = Boolean(maxItems);
      const hasData = Boolean(data);

      // Assumes 48 products per search page. When maxItems is unset,
      // Math.ceil(undefined / 48) is NaN, making numPages NaN (falsy),
      // so the fallback below re-assigns the full page count from
      // `data` — i.e. crawl every page.
      let numPages = hasData ? Math.min(data, Math.ceil(maxItems / 48)) : 1;
      if (!numPages) numPages = data;
      // One search URL per page: <startUrl>&page=1 .. &page=numPages.
      const searchPagesArray = Array.from(
        { length: numPages },
        (_, i) => request.url + `&page=${i + 1}`
      );

      // Per-page budget: page 1 gets the whole remainder, each later
      // page 48 fewer (the items consumed by earlier pages).
      let items = hasMaxItems ? maxItems : numPages * 48;
      await Promise.all(
        searchPagesArray.map((element) => {
          const userData = {
            label: Label.searchPage,
            itemsAdded: items,
          };
          items -= 48;
          return enqueueLinks({
            urls: [element],
            userData: userData,
          });
        })
      );
    }
  }
};
288
289export const getStartUrlsArray = (startUrls) => {
290 if (startUrls) {
291 return startUrls.map(({ url }) => {
292 return url;
293 });
294 }
295};
296
Developer
Maintained by Community
Actor stats
- 42 users
- 315 runs
- Modified 4 months ago
Categories