UK VAT number checker
Try for free
No credit card required
Go to Store
UK VAT number checker
novotnyj/uk-vat-number-checker
Try for free
No credit card required
This actor uses https://www.tax.service.gov.uk/ to check whether UK VAT numbers are valid. It can check multiple VAT numbers in one run. If a VAT number is valid, the registered business name and address are fetched as well.
.actor/Dockerfile
1# Specify the base Docker image. You can read more about
2# the available images at https://sdk.apify.com/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node:16
5
6# Copy just package.json and package-lock.json
7# to speed up the build using Docker layer cache.
8COPY package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14 && npm install --omit=dev --omit=optional \
15 && echo "Installed NPM packages:" \
16 && (npm list --omit=dev --all || true) \
17 && echo "Node.js version:" \
18 && node --version \
19 && echo "NPM version:" \
20 && npm --version \
21 && rm -r ~/.npm
22
23# Next, copy the remaining files and directories with the source code.
24# Since we do this after NPM install, quick build will be really fast
25# for most source file changes.
26COPY . ./
27
28
29# Run the image.
30CMD npm start --silent
.actor/INPUT_SCHEMA.json
1{
2 "title": "UK VAT number checker input",
3 "type": "object",
4 "schemaVersion": 1,
5 "properties": {
6 "vatIds": {
7 "title": "VAT numbers",
8 "type": "array",
9 "description": "UK VAT numbers to check.",
10 "editor": "stringList",
11 "prefill": ["GB123456789"],
12 "example": ["GB123456789"]
13 }
14 },
15 "required": ["vatIds"]
16}
.actor/README.md
1# UK VAT number checker
2
3Checks the given UK VAT numbers for validity. Uses `https://www.tax.service.gov.uk/check-vat-number/` to perform the check.
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "uk-vat-id-check",
4 "title": "UK VAT number checker",
5 "description": "",
6 "version": "0.0",
7 "input": "./INPUT_SCHEMA.json",
8 "readme": "./README.md",
9 "dockerfile": "./Dockerfile",
10 "minMemoryMbytes": 256,
11 "maxMemoryMbytes": 4096,
12 "storages": {
13 "dataset": {
14 "actorSpecification": 1,
15 "title": "UK VAT number check result",
16 "views": {
17 "floridaMan": {
18 "title": "UK VAT number check result",
19 "transformation": {
20 "fields": [
21 "vatId",
22 "isValid",
23 "businessName",
24 "checkedAt",
25 "address"
26 ]
27 },
28 "display": {
29 "component": "table",
30 "properties": {
31 "vatId": {
32 "label": "VAT number",
33 "format": "string"
34 },
35 "isValid": {
36 "label": "Is valid",
37 "format": "boolean"
38 },
39 "businessName": {
40 "label": "Business name",
41 "format": "string"
42 },
43 "address": {
44 "label": "Address",
45 "format": "string"
46 },
47 "checkedAt": {
48 "label": "Checked at",
49 "format": "datetime"
50 }
51 }
52 }
53 }
54 }
55 }
56 }
57}
src/main.js
1/**
2 * Apify actor that checks UK VAT numbers against
3 * https://www.tax.service.gov.uk/check-vat-number/ and stores one result per number.
4 * Requests are driven by Crawlee's `BasicCrawler`; pages are fetched with `got-scraping` and parsed with `cheerio`.
5 */
6
7// For more information, see https://sdk.apify.com
8import { Actor } from 'apify';
9// For more information, see https://crawlee.dev
10import { BasicCrawler } from 'crawlee';
11import { gotScraping } from 'got-scraping';
12import cheerio from 'cheerio';
13import { FingerprintGenerator } from 'fingerprint-generator';
14
// Accepted UK VAT number formats: standard = 9 digits, branches = 12 digits,
// government = "GD" + 3 digits, health authority = "HA" + 3 digits.
// The prefix is optional: "GB", or "XI" which is used for Northern Ireland.
const UK_VAT_NUMBER_REGEX = /^(GB|XI)?([0-9]{9}([0-9]{3})?$|(GD|HA)[0-9]{3}$)/;

// Initialize the Apify SDK
await Actor.init();

// Fall back to an empty object / empty list so that a run without input (or with a
// malformed `vatIds` value) finishes cleanly instead of crashing with a TypeError.
const input = (await Actor.getInput()) ?? {};
const vatIds = Array.isArray(input.vatIds) ? input.vatIds : [];

// Only numbers that pass the format check are sent to the remote service.
const vatIdsToValidate = [];
for (const vatId of vatIds) {
    // Entries that are not strings or do not match any accepted format are reported
    // immediately as invalid, without making a request.
    if (typeof vatId !== 'string' || !UK_VAT_NUMBER_REGEX.test(vatId)) {
        await Actor.pushData({ vatId, isValid: false, checkedAt: new Date(), address: null, businessName: null, status: 'error' });
    } else {
        vatIdsToValidate.push(vatId);
    }
}

// Every request targets the same form page, so the VAT number doubles as the
// unique key that tells the requests apart.
const startUrls = vatIdsToValidate.map((vatId) => ({
    url: 'https://www.tax.service.gov.uk/check-vat-number/enter-vat-details',
    uniqueKey: vatId,
    userData: { vatId, label: 'start' },
}));

const fingerprintGenerator = new FingerprintGenerator();

const proxyConfiguration = await Actor.createProxyConfiguration();
41
// Origin of the checking service, and the form page that both serves the CSRF token
// and accepts the POST carrying the VAT number.
const GOV_UK_ORIGIN = 'https://www.tax.service.gov.uk';
const ENTER_VAT_DETAILS_URL = `${GOV_UK_ORIGIN}/check-vat-number/enter-vat-details`;

const crawler = new BasicCrawler({
    useSessionPool: true,

    /**
     * Checks the VAT number stored in `request.userData.vatId` in three steps:
     *  1. GET the form page to read the CSRF token and collect the initial cookies,
     *  2. POST the VAT number and capture the redirect the service answers with,
     *  3. GET the redirect target and parse the verdict, business name and address.
     * The parsed result is pushed to the default dataset. Any error thrown here is left
     * to the crawler; `failedRequestHandler` below records requests that end up failing.
     */
    async requestHandler({ request, session, log }) {
        const { vatId } = request.userData;
        const { headers: browserHeaders } = fingerprintGenerator.getFingerprint({
            devices: ['desktop'],
            browsers: ['chrome'],
        });

        // NOTE(review): the optional call covers the case where no proxy configuration
        // was created; the requests then go out without a proxy — confirm this is wanted.
        const proxyUrl = await proxyConfiguration?.newUrl();

        // Step 1: load the form to obtain the CSRF token and the cookies tied to it.
        const formResponse = await gotScraping({
            url: ENTER_VAT_DETAILS_URL,
            method: 'GET',
            proxyUrl,
            http2: true,
            headers: { ...browserHeaders },
        });

        const $form = cheerio.load(formResponse.body);
        const csrfToken = $form('input[name="csrfToken"]').val();
        // Fail early with a clear message instead of posting a form that cannot succeed.
        if (!csrfToken) throw new Error('VAT number check failed: CSRF token not found');
        await session.setCookiesFromResponse(formResponse);

        // Step 2: submit the form.
        // We have to catch the redirect as the response contains set-cookie headers
        // and gotScraping won't use these cookies in the redirect request.
        // That's why this hack with maxRedirects: 0
        let postResponse;
        try {
            await gotScraping({
                url: ENTER_VAT_DETAILS_URL,
                form: { csrfToken, target: vatId, requester: '' },
                method: 'POST',
                proxyUrl,
                http2: true,
                maxRedirects: 0,
                headers: {
                    ...browserHeaders,
                    Cookie: session.getCookieString(GOV_UK_ORIGIN),
                    'Content-Type': 'application/x-www-form-urlencoded',
                    Referer: ENTER_VAT_DETAILS_URL,
                },
            });
        } catch (err) {
            // Errors without a response (e.g. network failures) or without a redirect
            // target are rethrown untouched so the original cause is not masked.
            if (!err.response?.headers?.location) throw err;
            postResponse = err.response;
        }

        if (!postResponse) throw new Error('VAT number check failed: Post to gov.uk failed');
        await session.setCookiesFromResponse(postResponse);

        // Step 3: follow the redirect manually, now with the cookies from the POST.
        const { location } = postResponse.headers;
        log.info('Fetching result', { location });
        const checkResponse = await gotScraping({
            // Resolving against the form URL handles both relative and absolute targets.
            url: new URL(location, ENTER_VAT_DETAILS_URL).href,
            proxyUrl,
            http2: true,
            maxRedirects: 1,
            headers: {
                ...browserHeaders,
                Cookie: session.getCookieString(GOV_UK_ORIGIN),
            },
        });
        const $ = cheerio.load(checkResponse.body);

        // The two verdicts are rendered in different headings, hence the separate selectors.
        const isInvalid = $('h1.govuk-heading-xl').text().toLowerCase().includes('invalid uk vat number');
        const isValid = $('h1.govuk-panel__title').text().toLowerCase().includes('valid uk vat number');
        if (!isInvalid && !isValid) {
            // Keep the unexpected page in the key-value store for later inspection.
            await Actor.setValue(`debug-${vatId}`, checkResponse.body, { contentType: 'text/html' });
            throw new Error('Could not check VAT number: Unknown response');
        }

        const result = {
            isValid,
            checkedAt: new Date(),
            vatId,
            businessName: null,
            address: null,
            status: 'success',
        };

        // Business details are taken from the element following each matching sub-heading.
        $('h3.govuk-heading-s').each((i, el) => {
            const heading = $(el).text();
            if (heading.includes('Registered business name')) {
                result.businessName = $(el).next().text().trim();
            }
            if (heading.includes('Registered business address')) {
                result.address = $(el).next().text().trim();
            }
        });

        await Actor.pushData(result);
    },

    /**
     * Stores an error record for a VAT number whose request failed, so that every
     * number sent to the crawler still produces a dataset item.
     */
    async failedRequestHandler({ request }, err) {
        await Actor.pushData({
            isValid: null,
            status: 'error',
            checkedAt: new Date(),
            vatId: request.userData.vatId,
            businessName: null,
            address: null,
            error: err.message,
        });
    },
});

await crawler.run(startUrls);

// Exit successfully
await Actor.exit();
src/routes.js
// `createCheerioRouter` was called below without being imported, which made this
// module throw a ReferenceError as soon as it was loaded.
import { createCheerioRouter, Dataset } from 'crawlee';

// NOTE(review): src/main.js as listed does not import this module, so these handlers
// look like leftovers of the project template — confirm before relying on them.
export const router = createCheerioRouter();

// Default route: enqueue links matching the glob and label them 'detail'.
// NOTE(review): no handler is registered for the 'detail' label in this file
// (only 'vatId' below) — verify which label was intended.
router.addDefaultHandler(async ({ enqueueLinks, log }) => {
    log.info(`enqueueing new URLs`);
    await enqueueLinks({
        globs: ['https://apify.com/*'],
        label: 'detail',
    });
});

// 'vatId' route: log the page title and store it together with the loaded URL.
router.addHandler('vatId', async ({ request, $, log }) => {
    const title = $('title').text();
    log.info(`${title}`, { url: request.loadedUrl });

    await Dataset.pushData({
        url: request.loadedUrl,
        title,
    });
});
.dockerignore
1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git
.editorconfig
1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf
.eslintrc
1{
2 "extends": "@apify",
3 "root": true
4}
.gitignore
1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5dist
6node_modules
7apify_storage
8storage
package.json
1{
2 "name": "crawlee-cheerio-javascript",
3 "version": "0.0.1",
4 "type": "module",
5 "description": "This is a boilerplate of an Apify actor.",
6 "engines": {
7 "node": ">=16.0.0"
8 },
9 "dependencies": {
10 "apify": "^3.0.0",
11 "crawlee": "^3.0.0",
12 "got-scraping": "^3.2.0",
13 "cheerio": "1.0.0-rc.12",
14 "fingerprint-generator": "^2.1.10"
15 },
16 "devDependencies": {
17 "@apify/eslint-config": "^0.3.1",
18 "eslint": "^8.20.0"
19 },
20 "scripts": {
21 "start": "node src/main.js",
22 "lint": "eslint ./src --ext .js,.jsx",
23 "lint:fix": "eslint ./src --ext .js,.jsx --fix",
24 "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
25 },
26 "author": "It's not you it's me",
27 "license": "ISC"
28}
Developer
Maintained by Community
Actor Metrics
1 monthly user
-
2 stars
>99% runs succeeded
Created in Feb 2023
Modified 7 months ago
Categories