1
2
3
4
5
6
7
import { Actor } from 'apify';
import { BasicCrawler } from 'crawlee';
import { gotScraping } from 'got-scraping';
import cheerio from 'cheerio';
import { FingerprintGenerator } from 'fingerprint-generator';
14
15
/**
 * Format check applied to every input VAT id before any network request is made.
 *
 * Accepts an optional `GB` or `XI` prefix followed by one of:
 *  - exactly 9 digits, optionally followed by 3 more digits (12 in total), or
 *  - `GD` or `HA` followed by exactly 3 digits.
 *
 * The pattern is anchored at both ends and is case-sensitive, so embedded
 * whitespace and lowercase prefixes are rejected.
 */
const UK_VAT_NUMBER_REGEX = /^(GB|XI)?([0-9]{9}([0-9]{3})?$|(GD|HA)[0-9]{3}$)/;
17
18
await Actor.init();
const input = await Actor.getInput();

// Fail with a clear message instead of an opaque TypeError when the input is
// missing or `vatIds` is not a list.
if (!Array.isArray(input?.vatIds)) {
    throw new Error('Invalid input: "vatIds" must be an array of VAT numbers');
}

// Ids that fail the format check are reported straight away as invalid and are
// never sent to gov.uk; only well-formed ids are queued for the online check.
const vatIdsToValidate = [];
for (const vatId of input.vatIds) {
    // Non-string entries cannot be matched against the pattern; report them
    // like any other malformed id rather than crashing the whole run.
    const hasValidFormat = typeof vatId === 'string' && UK_VAT_NUMBER_REGEX.test(vatId);

    if (hasValidFormat) {
        vatIdsToValidate.push(vatId);
    } else {
        await Actor.pushData({ vatId, isValid: false, checkedAt: new Date(), address: null, businessName: null, status: 'error' });
    }
}

// Every request targets the same form URL, so the VAT id doubles as the
// uniqueKey that distinguishes one request from another.
const startUrls = vatIdsToValidate.map((vatId) => ({
    url: 'https://www.tax.service.gov.uk/check-vat-number/enter-vat-details',
    uniqueKey: vatId,
    userData: { vatId, label: 'start' },
}));
37
// Source of browser fingerprints; the request handler uses the generated headers.
const fingerprintGenerator = new FingerprintGenerator();
// Proxy configuration created with no explicit options; the request handler
// asks it for a proxy URL once per request.
const proxyConfiguration = await Actor.createProxyConfiguration();
40
/** Origin of the gov.uk VAT checker; also used to select the session cookies to send. */
const VAT_CHECK_ORIGIN = 'https://www.tax.service.gov.uk';
/** Page that serves the lookup form (GET) and receives its submission (POST). */
const VAT_CHECK_FORM_URL = `${VAT_CHECK_ORIGIN}/check-vat-number/enter-vat-details`;

/**
 * Crawler that checks one VAT id per request against the gov.uk VAT checker.
 *
 * For each request the handler:
 *  1. GETs the lookup form and reads its `csrfToken` input,
 *  2. POSTs the form with the VAT id and captures the redirect `Location`,
 *  3. GETs the redirect target and parses the outcome from the result page.
 *
 * A successful check pushes a record with `status: 'success'`; a request that
 * ends up in `failedRequestHandler` pushes a record with `status: 'error'`
 * and the error message.
 */
const crawler = new BasicCrawler({
    useSessionPool: true,
    async requestHandler({ request, session, log }) {
        const { vatId } = request.userData;
        const { headers: fingerprintHeaders } = fingerprintGenerator.getFingerprint({
            devices: ['desktop'],
            browsers: ['chrome'],
        });

        // createProxyConfiguration() can resolve to undefined; in that case the
        // requests are sent without a proxy instead of crashing on `.newUrl()`.
        const proxyUrl = await proxyConfiguration?.newUrl();

        // Step 1: load the form to obtain the CSRF token and the initial cookies.
        const formResponse = await gotScraping({
            url: VAT_CHECK_FORM_URL,
            method: 'GET',
            proxyUrl,
            http2: true,
            headers: {
                ...fingerprintHeaders,
            },
        });

        const $form = cheerio.load(formResponse.body);
        const csrfToken = $form('input[name="csrfToken"]').val();
        if (!csrfToken) {
            throw new Error('VAT number check failed: CSRF token not found on gov.uk form');
        }
        await session.setCookiesFromResponse(formResponse);

        // Step 2: submit the form. Redirects are disallowed (maxRedirects: 0) and
        // the handler relies on the redirect being reported as an error that
        // carries the response, from which the Location header is read.
        let postResponse;
        try {
            await gotScraping({
                url: VAT_CHECK_FORM_URL,
                form: {
                    csrfToken,
                    target: vatId,
                    requester: '',
                },
                method: 'POST',
                proxyUrl,
                http2: true,
                maxRedirects: 0,
                headers: {
                    ...fingerprintHeaders,
                    Cookie: session.getCookieString(VAT_CHECK_ORIGIN),
                    'Content-Type': 'application/x-www-form-urlencoded',
                    Referer: VAT_CHECK_FORM_URL,
                },
            });
        } catch (err) {
            // Errors without a response (e.g. network failures) or without a
            // Location header are genuine failures: rethrow them untouched so
            // the original error is not masked by a TypeError.
            if (!err.response?.headers?.location) throw err;
            postResponse = err.response;
        }

        // A POST that completed without redirecting leaves nothing to follow.
        if (!postResponse) throw new Error('VAT number check failed: Post to gov.uk failed');
        await session.setCookiesFromResponse(postResponse);

        // Step 3: fetch the result page the POST redirected to.
        const { location } = postResponse.headers;
        log.info('Fetching result', { location });
        const checkResponse = await gotScraping({
            // Resolve against the origin so both relative and absolute Location values work.
            url: new URL(location, VAT_CHECK_ORIGIN).href,
            proxyUrl,
            http2: true,
            maxRedirects: 1,
            headers: {
                ...fingerprintHeaders,
                Cookie: session.getCookieString(VAT_CHECK_ORIGIN),
            },
        });
        const $result = cheerio.load(checkResponse.body);

        // The two outcomes are read from different heading elements.
        const isInvalid = $result('h1.govuk-heading-xl').text().toLowerCase().includes('invalid uk vat number');
        const isValid = $result('h1.govuk-panel__title').text().toLowerCase().includes('valid uk vat number');
        if (!isInvalid && !isValid) {
            // Keep the unrecognised page in the key-value store for later inspection.
            await Actor.setValue(`debug-${vatId}`, checkResponse.body, { contentType: 'text/html' });
            throw new Error('Could not check VAT number: Unknown response');
        }

        const result = {
            isValid,
            checkedAt: new Date(),
            vatId,
            businessName: null,
            address: null,
            status: 'success',
        };

        // Business details are taken from the element that follows each matching sub-heading.
        $result('h3.govuk-heading-s').each((i, el) => {
            const heading = $result(el).text();
            if (heading.includes('Registered business name')) {
                result.businessName = $result(el).next().text().trim();
            }
            if (heading.includes('Registered business address')) {
                result.address = $result(el).next().text().trim();
            }
        });

        await Actor.pushData(result);
    },
    async failedRequestHandler({ request }, err) {
        // Record the failure so every queued VAT id still yields a dataset item.
        await Actor.pushData({
            isValid: null,
            status: 'error',
            checkedAt: new Date(),
            vatId: request.userData.vatId,
            businessName: null,
            address: null,
            error: err.message,
        });
    },
});
150
// Run the crawler over every queued VAT id, then exit the actor.
await crawler.run(startUrls);

await Actor.exit();