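/**
 * Apify actor that validates UK VAT numbers against the gov.uk VAT number
 * checking service (https://www.tax.service.gov.uk/check-vat-number).
 * It started from the `CheerioCrawler` template, but the code below drives a `BasicCrawler`
 * and performs the HTTP requests itself with `got-scraping`, parsing the HTML with `cheerio`.
 * If you're looking for examples or want to learn more, see README.
 */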
6
7// For more information, see https://sdk.apify.com
8import { Actor } from 'apify';
9// For more information, see https://crawlee.dev
10import { BasicCrawler } from 'crawlee';
11import { gotScraping } from 'got-scraping';
12import cheerio from 'cheerio';
13import { FingerprintGenerator } from 'fingerprint-generator';
14
// UK VAT number formats accepted by the pre-check below. Each may carry an optional
// "GB" prefix, or "XI" for Northern Ireland:
//   - standard:         9 digits
//   - branches:         12 digits
//   - government:       "GD" + 3 digits
//   - health authority: "HA" + 3 digits
const UK_VAT_NUMBER_REGEX = /^(GB|XI)?([0-9]{9}([0-9]{3})?$|(GD|HA)[0-9]{3}$)/;

// Initialize the Apify SDK
await Actor.init();
const input = await Actor.getInput();

// Fail with an actionable message instead of an opaque TypeError
// when the actor input is missing or `vatIds` is not a list.
if (!Array.isArray(input?.vatIds)) {
    throw new Error('Invalid input: "vatIds" must be an array of VAT numbers');
}

// VAT numbers that pass the syntactic pre-check and still need to be looked up online.
const vatIdsToValidate = [];
for (const vatId of input.vatIds) {
    // Only well-formed strings are worth a round trip to gov.uk. Everything else
    // (including non-string entries) is reported right away as an invalid VAT number.
    if (typeof vatId === 'string' && UK_VAT_NUMBER_REGEX.test(vatId)) {
        vatIdsToValidate.push(vatId);
    } else {
        await Actor.pushData({ vatId, isValid: false, checkedAt: new Date(), address: null, businessName: null, status: 'error' });
    }
}
31
// Build one crawler request per pre-validated VAT number. Every request targets the same
// form URL, so the VAT number is used as the unique key (presumably so that the requests
// are not treated as duplicates of each other — confirm against crawlee's request queue docs).
const startUrls = [];
for (const vatId of vatIdsToValidate) {
    startUrls.push({
        url: 'https://www.tax.service.gov.uk/check-vat-number/enter-vat-details',
        uniqueKey: vatId,
        userData: { vatId, label: 'start' },
    });
}

// Generator of browser fingerprints; the request handler sends the fingerprint's headers.
const fingerprintGenerator = new FingerprintGenerator();

// Proxy configuration created without any options, i.e. with the SDK defaults.
const proxyConfiguration = await Actor.createProxyConfiguration();
41
// Origin and form URL of the gov.uk VAT checking service used by every request below.
const GOV_UK_ORIGIN = 'https://www.tax.service.gov.uk';
const VAT_CHECK_FORM_URL = `${GOV_UK_ORIGIN}/check-vat-number/enter-vat-details`;

const crawler = new BasicCrawler({
    useSessionPool: true,

    /**
     * Checks a single VAT number (taken from `request.userData.vatId`) in three HTTP steps:
     *   1. GET the form page to obtain the CSRF token and cookies,
     *   2. POST the form and capture the redirect it answers with,
     *   3. GET the redirect target and parse the verdict from the HTML.
     * The outcome is pushed to the default dataset.
     *
     * @throws {Error} when the CSRF token is missing, the POST does not produce a redirect,
     *   or the result page is neither the "valid" nor the "invalid" page. The error is left
     *   to the crawler, which is configured with `failedRequestHandler` below.
     */
    async requestHandler({ request, session, log }) {
        const { vatId } = request.userData;
        const { headers: fingerprintHeaders } = fingerprintGenerator.getFingerprint({
            devices: ['desktop'],
            browsers: ['chrome'],
        });

        // The Apify SDK typings allow createProxyConfiguration() to resolve to undefined;
        // in that case the requests go out without a proxy instead of crashing here.
        const proxyUrl = await proxyConfiguration?.newUrl();

        // Step 1: load the form to get a fresh CSRF token and the cookies tied to it.
        const formResponse = await gotScraping({
            url: VAT_CHECK_FORM_URL,
            method: 'GET',
            proxyUrl,
            http2: true,
            headers: {
                ...fingerprintHeaders,
            },
        });

        const csrfToken = cheerio.load(formResponse.body)('input[name="csrfToken"]').val();
        if (!csrfToken) {
            // Without the token the POST cannot succeed; fail early with a clear reason.
            throw new Error('VAT number check failed: CSRF token not found on the form page');
        }
        await session.setCookiesFromResponse(formResponse);

        // Step 2: submit the form.
        // We have to catch the redirect ourselves, because the redirect response carries
        // set-cookie headers that gotScraping would not send along when following it.
        // With maxRedirects: 0 the call rejects on the redirect and the error exposes the response.
        let postResponse;
        try {
            await gotScraping({
                url: VAT_CHECK_FORM_URL,
                form: {
                    csrfToken,
                    target: vatId,
                    requester: '',
                },
                method: 'POST',
                proxyUrl,
                http2: true,
                maxRedirects: 0,
                headers: {
                    ...fingerprintHeaders,
                    Cookie: session.getCookieString(GOV_UK_ORIGIN),
                    'Content-Type': 'application/x-www-form-urlencoded',
                    Referer: VAT_CHECK_FORM_URL,
                },
            });
        } catch (err) {
            // Errors without a response (network failures, timeouts) or without a Location
            // header are genuine failures: rethrow them untouched rather than tripping over
            // an undefined `err.response`.
            if (!err.response?.headers?.location) throw err;
            postResponse = err.response;
        }

        // A POST that completed without redirecting gives us nothing to follow.
        if (!postResponse) throw new Error('VAT number check failed: Post to gov.uk failed');
        await session.setCookiesFromResponse(postResponse);

        // Step 3: fetch the page the form redirected to.
        const { location } = postResponse.headers;
        log.info('Fetching result', { location });
        // Resolve against the origin so both relative and absolute Location values work.
        const resultUrl = new URL(location, GOV_UK_ORIGIN).href;
        const checkResponse = await gotScraping({
            url: resultUrl,
            proxyUrl,
            http2: true,
            maxRedirects: 1,
            headers: {
                ...fingerprintHeaders,
                Cookie: session.getCookieString(GOV_UK_ORIGIN),
            },
        });
        const $ = cheerio.load(checkResponse.body);

        // The two verdicts live in different headings, so the "valid" substring check
        // cannot be triggered by the "invalid" page.
        const isInvalid = $('h1.govuk-heading-xl').text().toLowerCase().includes('invalid uk vat number');
        const isValid = $('h1.govuk-panel__title').text().toLowerCase().includes('valid uk vat number');
        if (!isInvalid && !isValid) {
            // Keep the unexpected HTML in the key-value store for later inspection.
            await Actor.setValue(`debug-${vatId}`, checkResponse.body, { contentType: 'text/html' });
            throw new Error('Could not check VAT number: Unknown response');
        }

        const result = {
            isValid,
            checkedAt: new Date(),
            vatId,
            businessName: null,
            address: null,
            status: 'success',
        };

        // Business details are rendered as a sub-heading followed by the element holding the value.
        $('h3.govuk-heading-s').each((i, el) => {
            const heading = $(el).text();
            if (heading.includes('Registered business name')) {
                result.businessName = $(el).next().text().trim();
            }
            if (heading.includes('Registered business address')) {
                result.address = $(el).next().text().trim();
            }
        });

        await Actor.pushData(result);
    },

    /**
     * Records a VAT number whose check could not be completed as an `error` item,
     * with `isValid: null` and the error message attached.
     */
    async failedRequestHandler({ request }, err) {
        await Actor.pushData({
            isValid: null,
            status: 'error',
            checkedAt: new Date(),
            vatId: request.userData.vatId,
            businessName: null,
            address: null,
            error: err.message,
        });
    },
});

await crawler.run(startUrls);

// Exit successfully
await Actor.exit();