trustpilot
DeprecatedView all Actors
This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?
See alternative Actorstrustpilot
enco/trustpilot
Extract review data from Trustpilot in a simple way. Configure it to obtain only the ratings that interest you.
.editorconfig
1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf
.eslintrc
1{
2 "extends": "@apify"
3}
.gitignore
1# This file tells Git which files shouldn't be added to source control
2
3.idea
4node_modules
5
6apify_storage
Dockerfile
# First, specify the base Docker image. You can read more about
# the available images at https://sdk.apify.com/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-playwright-chrome:16

# Second, copy just package.json and package-lock.json, since they are
# the only files that affect "npm install" in the next step. Copying them
# first lets Docker reuse the cached install layer and speeds up the build.
COPY package*.json ./

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
# NOTE: "--omit=dev --omit=optional" replaces the "--only=prod --no-optional"
# flags, which are deprecated since npm 7 (the Node.js 16 image ships npm 8).
RUN npm --quiet set progress=false \
 && npm install --omit=dev --omit=optional \
 && echo "Installed NPM packages:" \
 && (npm list --omit=dev --omit=optional --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick builds will be really fast
# for most source file changes.
COPY . ./

# Optionally, specify how to launch the source code of your actor.
# By default, Apify's base Docker images define the CMD instruction
# that runs the Node.js source code using the command specified
# in the "scripts.start" section of the package.json file.
# In short, the instruction looks something like this:
#
# CMD npm start
INPUT_SCHEMA.json
1{
2 "title": "trustpilot scraper schema",
3 "description": "The input schema",
4 "type": "object",
5 "schemaVersion": 1,
6 "properties": {
7 "startUrls": {
8 "title": "Start URL of the commerce",
9 "type": "string",
10 "description": "URL to start with.",
11 "editor": "textfield",
12 "prefill":"https://www.trustpilot.com/review/carsandbids.com"
13 },
14 "startReviews": {
15 "title": "Start Reviews",
16 "type": "array",
17 "description": "Enter the start reviews",
18 "prefill": [
19 5
20 ],
21 "editor": "stringList",
22 "sectionCaption": "Add the start reviews",
23 "sectionDescription": "The review ratings to scrape (Excellent = 5, Great = 4, Average = 3, Poor = 2, Bad = 1). You can choose all of them or just one, but don't leave the array empty.",
24 "placeholderKey": "Add a start review",
25 "placeholderValue": "Add a start review",
26 "maxItems": 5
27 },
28 "maxItems": {
29 "title": "maxItems",
30 "type": "integer",
31 "description": "Max numbers of items",
32 "nullable": true
33 }
34 },
35 "required": [
36 "startUrls"
37 ]
38}
apify.json
1{
2 "name": "trustpilot",
3 "version": "0.0",
4 "buildTag": "latest",
5 "env": null,
6 "template": "project_playwright_crawler"
7}
main.js
1const Apify = require('apify');
2// const playwright = require('playwright');
3const { handleStart, handleList, handleDetail } = require('./src/routes');
4
5const { utils: { log } } = Apify;
6
Apify.main(async () => {
    // The actor input provides the single start URL under `startUrls`.
    const input = await Apify.getInput();
    const sources = [{ url: input.startUrls }];

    const requestList = await Apify.openRequestList('start-urls', sources);
    const requestQueue = await Apify.openRequestQueue();
    const proxyConfiguration = await Apify.createProxyConfiguration();

    // Requests carrying a known label go to their dedicated route;
    // anything else (including the unlabeled start URL) goes to handleStart.
    const labelHandlers = new Map([
        ['LIST', handleList],
        ['DETAIL', handleDetail],
    ]);

    /** Logs the opened page and forwards the crawling context to the matching route. */
    const handlePageFunction = async (context) => {
        const { request } = context;
        const { label } = request.userData;
        log.info('Page opened.', { label, url: request.url });

        const handler = labelHandlers.get(label);
        if (handler === undefined) {
            return handleStart(context, requestQueue);
        }
        return handler(context);
    };

    const crawler = new Apify.PlaywrightCrawler({
        requestList,
        requestQueue,
        proxyConfiguration,
        launchContext: {
            // Runs on Chrome. Switching to Firefox or WebKit on the Apify
            // Platform also requires changing the base image in the Dockerfile.
            useChrome: true,
        },
        browserPoolOptions: {
            // Fingerprints help against anti-scraping protections;
            // turn this off if performance becomes a problem.
            useFingerprints: true,
        },
        handlePageFunction,
    });

    log.info('Starting the crawl.');
    await crawler.run();
    log.info('Crawl finished.');
});
package.json
1{
2 "name": "trustpilot",
3 "version": "0.0.1",
4 "description": "This is a boilerplate of an Apify actor.",
5 "dependencies": {
6 "apify": "^2.3.2",
7 "cheerio": "^1.0.0-rc.12",
8 "playwright": "*"
9 },
10 "devDependencies": {
11 "@apify/eslint-config": "^0.1.3",
12 "eslint": "^7.0.0"
13 },
14 "scripts": {
15 "start": "node main.js",
16 "lint": "./node_modules/.bin/eslint src --ext .js,.jsx",
17 "lint:fix": "./node_modules/.bin/eslint src --ext .js,.jsx --fix",
18 "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
19 },
20 "author": "It's not you it's me",
21 "license": "ISC"
22}
src/routes.js
1const Apify = require('apify');
2const { load } = require('cheerio');
3const { utils: { log } } = Apify;
4let itemsCounter = 0;
5exports.handleStart = async ({ request, page }, requestQueue) => {
6 const { startReviews, maxItems } = await Apify.getInput();
7 log.info('DEFAULT HANDLER REQUEST')
8 const url = new URL(page.url());
9 url.searchParams.set('languages', 'all')
10 const body = await page.content();
11 ////////////
12 const $ = load(body)
13 if (maxItems) {
14 let pages = Math.ceil(maxItems / 20);
15 //Validate page1 because if put params page=1 the url dont work
16 log.info('Number of pages')
17 log.info(pages)
18 for (let index = 1; index <= pages; index++) {
19 if (index == 1) {
20 url.searchParams.delete('page');
21 } else {
22 url.searchParams.set('page', String(index))
23 }
24 let urlStarts = url.toString();
25 startReviews?.forEach((starRewview) => {
26 urlStarts += `&stars=${starRewview}`
27 })
28 await requestQueue.addRequest({
29 url: urlStarts,
30 userData: {
31 label: 'LIST'
32 }
33 })
34 }
35 } else {
36 const totalPages = Number($('a[data-pagination-button-last-link]').text());
37 log.info('Number of pages to scrape')
38 log.info(totalPages)
39 for (let index = 1; index <= totalPages; index++) {
40 if (index == 1) {
41 url.searchParams.delete('page');
42 }
43 url.searchParams.set('page', String(index))
44 let urlStarts = url.toString();
45 startReviews?.forEach((starRewview) => {
46 urlStarts += `&stars=${starRewview}`
47 })
48 await requestQueue.addRequest({
49 url: urlStarts,
50 userData: {
51 label: 'DETAIL'
52 }
53 })
54 }
55 }
56 ////////////
57 log.info(`Handle Start URLs`);
58};
59
60exports.handleList = async ({ request, page }) => {
61 const { maxItems } = await Apify.getInput();
62 log.info(`Handle pagination`);
63 await page.waitForSelector('section[data-business-unit-reviews-section="true"]')
64 const html = await page.content();
65 //////////
66 const $ = load(html);
67 $('article[data-service-review-card-paper="true"]').each(async (_i, article) => {
68 itemsCounter++;
69 let item = {};
70 item.userName = $(article).find(`div[data-consumer-name-typography="true"]`).text()
71 item.userCountry = $(article).find(`span[data-consumer-country-typography]`).text()
72 item.reviewRating = Number($(article).find('div[data-service-review-rating]').attr('data-service-review-rating'))
73 item.reviewDate = $(article).find(`time[data-service-review-date-time-ago]`).attr('datetime')
74 item.reviewTitle = $(article).find(`a[data-review-title-typography]`).text()
75 item.reviewText = $(article).find(`p[data-service-review-text-typography]`).text()
76 item.verified = $(article).find(`button[data-review-label-tooltip-trigger="true"]`).text()
77 ? true
78 : false
79 item.businessReply = $(article).find('[data-service-review-business-reply-text-typography="true"]').length == 0
80 ? null
81 : $(article).find('[data-service-review-business-reply-text-typography="true"]').text()
82 if (itemsCounter <= maxItems) {
83 await Apify.pushData(item)
84 }
85 })
86 //////
87 log.info('Items push to dataset default')
88};
89
90exports.handleDetail = async ({ request, page }) => {
91 log.info(`Handle pagination`);
92 await page.waitForSelector('section[data-business-unit-reviews-section="true"]')
93 const html = await page.content();
94 const $ = load(html);
95 $('article[data-service-review-card-paper="true"]').each(async (_i, article) => {
96 let item = {};
97 item.userName = $(article).find(`div[data-consumer-name-typography="true"]`).text()
98 item.userCountry = $(article).find(`span[data-consumer-country-typography]`).text()
99 item.reviewRating = $(article).find('div[data-service-review-rating]').attr('data-service-review-rating')
100 item.reviewDate = $(article).find(`time[data-service-review-date-time-ago]`).attr('datetime')
101 item.reviewTitle = $(article).find(`a[data-review-title-typography]`).text()
102 item.reviewText = $(article).find(`p[data-service-review-text-typography]`).text()
103 item.verified = $(article).find(`button[data-review-label-tooltip-trigger="true"]`).text()
104 ? true
105 : false
106 item.businessReply = $(article).find('[data-service-review-business-reply-text-typography="true"]')
107 ? $(article).find('[data-service-review-business-reply-text-typography="true"]').text()
108 : null
109 await Apify.pushData(item)
110 })
111 log.info('Items push to dataset default')
112};
Developer
Maintained by Community
Categories