UK VAT number checker avatar
UK VAT number checker
Try for free

No credit card required

View all Actors
UK VAT number checker

UK VAT number checker

novotnyj/uk-vat-number-checker
Try for free

No credit card required

This actor uses https://www.tax.service.gov.uk/ to check whether a UK VAT number is valid. It can check multiple VAT numbers in one run. If a VAT number is valid, the registered business name and address are fetched as well.

.actor/Dockerfile

1# Specify the base Docker image. You can read more about
2# the available images at https://sdk.apify.com/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node:16
5
6# Copy just package.json and package-lock.json
7# to speed up the build using Docker layer cache.
8COPY package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14    && npm install --omit=dev --omit=optional \
15    && echo "Installed NPM packages:" \
16    && (npm list --omit=dev --all || true) \
17    && echo "Node.js version:" \
18    && node --version \
19    && echo "NPM version:" \
20    && npm --version \
21    && rm -r ~/.npm
22
23# Next, copy the remaining files and directories with the source code.
24# Since we do this after NPM install, quick build will be really fast
25# for most source file changes.
26COPY . ./
27
28
29# Run the image.
30CMD npm start --silent

.actor/INPUT_SCHEMA.json

1{
2    "title": "UK VAT number checker input",
3    "type": "object",
4    "schemaVersion": 1,
5    "properties": {
6        "vatIds": {
7            "title": "VAT numbers",
8            "type": "array",
9            "description": "UK VAT numbers to check.",
10            "editor": "stringList",
11            "prefill": ["GB123456789"],
12            "example": ["GB123456789"]
13        }
14    },
15    "required": ["vatIds"]
16}

.actor/README.md

1# UK VAT number checker
2
3Checks the given UK VAT numbers using `https://www.tax.service.gov.uk/check-vat-number/`.

.actor/actor.json

1{
2	"actorSpecification": 1,
3	"name": "uk-vat-id-check",
4	"title": "UK VAT number checker",
5	"description": "",
6	"version": "0.0",
7	"input": "./INPUT_SCHEMA.json",
8	"readme": "./README.md",
9	"dockerfile": "./Dockerfile",
10	"minMemoryMbytes": 256,
11	"maxMemoryMbytes": 4096,
12	"storages": {
13		"dataset": {
14			"actorSpecification": 1,
15			"title": "UK VAT number check result",
16			"views": {
17				"floridaMan": {
18					"title": "UK VAT number check result",
19					"transformation": {
20						"fields": [
21							"vatId",
22							"isValid",
23							"businessName",
24                            "checkedAt",
25                            "address"
26						]
27					},
28					"display": {
29						"component": "table",
30						"properties": {
31							"vatId": {
32								"label": "VAT number",
33								"format": "string"
34							},
35							"isValid": {
36								"label": "Is valid",
37								"format": "boolean"
38							},
39							"businessName": {
40								"label": "Business name",
41								"format": "string"
42							},
43                            "address": {
44								"label": "Address",
45								"format": "string"
46							},
47                            "checkedAt": {
48								"label": "Checked at",
49								"format": "datetime"
50							}
51						}
52					}
53				}
54			}
55		}
56	}
57}

src/main.js

/**
 * UK VAT number checker.
 *
 * Reads `vatIds` from the Actor input, reports entries that do not look like a UK VAT number
 * as errors, and checks the remaining ones through the form at
 * https://www.tax.service.gov.uk/check-vat-number/enter-vat-details.
 * Results are pushed to the default dataset.
 */

// For more information, see https://sdk.apify.com
import { Actor } from 'apify';
// For more information, see https://crawlee.dev
import { BasicCrawler } from 'crawlee';
import { gotScraping } from 'got-scraping';
import cheerio from 'cheerio';
import { FingerprintGenerator } from 'fingerprint-generator';

// Origin of the checked service and the form page used for both the GET and the POST.
const BASE_URL = 'https://www.tax.service.gov.uk';
const FORM_URL = `${BASE_URL}/check-vat-number/enter-vat-details`;

// UK (Standard = 9 digits), (Branches = 12 digits), (Government = GD + 3 digits),
// (Health authority = HA + 3 digits). The "GB" prefix is optional; "XI" is used for Northern Ireland.
const UK_VAT_NUMBER_REGEX = /^(GB|XI)?([0-9]{9}([0-9]{3})?$|(GD|HA)[0-9]{3}$)/;

// Initialize the Apify SDK
await Actor.init();
const input = await Actor.getInput();

// Split the input: malformed entries are reported right away, the rest is queued for the online check.
const vatIdsToValidate = [];
for (const vatId of input.vatIds) {
    // Non-string entries cannot be matched against the pattern; report them as invalid
    // instead of letting the whole run crash.
    if (typeof vatId !== 'string' || !UK_VAT_NUMBER_REGEX.test(vatId)) {
        await Actor.pushData({ vatId, isValid: false, checkedAt: new Date(), address: null, businessName: null, status: 'error' });
    } else {
        vatIdsToValidate.push(vatId);
    }
}

// Every request targets the same form URL, so the VAT number serves as the unique key.
const startUrls = vatIdsToValidate.map((vatId) => ({
    url: FORM_URL,
    uniqueKey: vatId,
    userData: { vatId, label: 'start' },
}));

const fingerprintGenerator = new FingerprintGenerator();

const proxyConfiguration = await Actor.createProxyConfiguration();

const crawler = new BasicCrawler({
    useSessionPool: true,
    /**
     * Checks a single VAT number: loads the form to obtain the CSRF token, submits the number,
     * follows the redirect to the result page and pushes the parsed result to the dataset.
     */
    async requestHandler({ request, session, log }) {
        const { vatId } = request.userData;
        const browserFingerprintWithHeaders = fingerprintGenerator.getFingerprint({
            devices: ['desktop'],
            browsers: ['chrome'],
        });

        // The same proxy URL is reused for all three HTTP calls of this check.
        const proxyUrl = await proxyConfiguration.newUrl();
        const response = await gotScraping({
            url: FORM_URL,
            method: 'GET',
            proxyUrl,
            http2: true,
            headers: {
                ...browserFingerprintWithHeaders.headers,
            },
        });

        let $ = cheerio.load(response.body);
        const token = $('input[name="csrfToken"]').val();
        const payload = {
            csrfToken: token,
            target: vatId,
            requester: '',
        };
        await session.setCookiesFromResponse(response);

        let postResponse;
        // We have to catch the redirect as the response contains set-cookie headers
        // And gotScraping won't use these cookies in redirect request
        // That's why this hack with maxRedirects: 0
        try {
            await gotScraping({
                url: FORM_URL,
                form: payload,
                method: 'POST',
                proxyUrl,
                http2: true,
                maxRedirects: 0,
                headers: {
                    ...browserFingerprintWithHeaders.headers,
                    Cookie: session.getCookieString(BASE_URL),
                    'Content-Type': 'application/x-www-form-urlencoded',
                    Referer: FORM_URL,
                },
            });
        } catch (err) {
            // Errors without a response (e.g. network failures) or without a Location header
            // are not the expected redirect; rethrow the original error untouched.
            if (!err.response?.headers?.location) throw err;
            postResponse = err.response;
        }

        if (!postResponse) throw new Error('VAT number check failed: Post to gov.uk failed');
        await session.setCookiesFromResponse(postResponse);

        log.info('Fetching result', { location: postResponse.headers.location });
        // Resolve the Location header against the service origin so that both relative
        // and absolute redirect targets produce a valid URL.
        const resultUrl = new URL(postResponse.headers.location, BASE_URL).href;
        const checkResponse = await gotScraping({
            url: resultUrl,
            proxyUrl,
            http2: true,
            maxRedirects: 1,
            headers: {
                ...browserFingerprintWithHeaders.headers,
                Cookie: session.getCookieString(BASE_URL),
            },
        });
        $ = cheerio.load(checkResponse.body);

        // The two outcomes are recognised by different heading elements on the result page.
        const isInvalid = $('h1.govuk-heading-xl').text().toLowerCase().includes('invalid uk vat number');
        const isValid = $('h1.govuk-panel__title').text().toLowerCase().includes('valid uk vat number');
        if (!isInvalid && !isValid) {
            // Store the unexpected page for later inspection before failing the request.
            await Actor.setValue(`debug-${vatId}`, checkResponse.body, { contentType: 'text/html' });
            throw new Error('Could not check VAT number: Unknown response');
        }

        const result = {
            isValid,
            checkedAt: new Date(),
            vatId,
            businessName: null,
            address: null,
            status: 'success',
        };

        // Business details follow their respective sub-headings; they stay null when absent.
        const subHeaders = $('h3.govuk-heading-s');
        subHeaders.each((i, el) => {
            if ($(el).text().includes('Registered business name')) {
                result.businessName = $(el).next().text().trim();
            }
            if ($(el).text().includes('Registered business address')) {
                result.address = $(el).next().text().trim();
            }
        });

        await Actor.pushData(result);
    },
    /**
     * Records an error item for a VAT number whose request ended up in this handler.
     */
    async failedRequestHandler({ request }, err) {
        await Actor.pushData({
            isValid: null,
            status: 'error',
            checkedAt: new Date(),
            vatId: request.userData.vatId,
            businessName: null,
            address: null,
            error: err.message,
        });
    },
});

await crawler.run(startUrls);

// Exit successfully
await Actor.exit();

src/routes.js

// `createCheerioRouter` must be imported alongside `Dataset`; without it, loading this module
// throws a ReferenceError.
import { createCheerioRouter, Dataset } from 'crawlee';

// NOTE(review): this router looks like a leftover from the CheerioCrawler template; the visible
// src/main.js does not import it. Confirm whether the file is still needed.
export const router = createCheerioRouter();

// Default route: enqueue links matching the glob and label them 'detail'.
// NOTE(review): no handler is registered for the 'detail' label (only 'vatId' below) — verify
// that the labels are intended to differ.
router.addDefaultHandler(async ({ enqueueLinks, log }) => {
    log.info(`enqueueing new URLs`);
    await enqueueLinks({
        globs: ['https://apify.com/*'],
        label: 'detail',
    });
});

// 'vatId' route: log the page title and store it together with the loaded URL.
router.addHandler('vatId', async ({ request, $, log }) => {
    const title = $('title').text();
    log.info(`${title}`, { url: request.loadedUrl });

    await Dataset.pushData({
        url: request.loadedUrl,
        title,
    });
});

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git

.editorconfig

1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf

.eslintrc

1{
2    "extends": "@apify",
3    "root": true
4}

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5dist
6node_modules
7apify_storage
8storage

package.json

1{
2    "name": "crawlee-cheerio-javascript",
3    "version": "0.0.1",
4    "type": "module",
5    "description": "This is a boilerplate of an Apify actor.",
6    "engines": {
7        "node": ">=16.0.0"
8    },
9    "dependencies": {
10        "apify": "^3.0.0",
11        "crawlee": "^3.0.0",
12        "got-scraping": "^3.2.0",
13         "cheerio": "1.0.0-rc.12",
14         "fingerprint-generator": "^2.1.10"
15    },
16    "devDependencies": {
17        "@apify/eslint-config": "^0.3.1",
18        "eslint": "^8.20.0"
19    },
20    "scripts": {
21        "start": "node src/main.js",
22        "lint": "eslint ./src --ext .js,.jsx",
23        "lint:fix": "eslint ./src --ext .js,.jsx --fix",
24        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
25    },
26    "author": "It's not you it's me",
27    "license": "ISC"
28}
Developer
Maintained by Community
Actor metrics
  • 1 monthly user
  • 0 stars
  • 100.0% runs succeeded
  • Created in Feb 2023
  • Modified 12 days ago