Google Search By Image avatar
Google Search By Image
Deprecated
View all Actors
This Actor is deprecated

This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?

See alternative Actors
Google Search By Image

Google Search By Image

pocesar/google-search-by-image

Search Google results by providing image URLs. It can be used for: reverse image search, finding people by photo, detecting and identifying objects in images, getting product names, finding social media related to the person in the photo, and finding related images and terms.

src/main.ts

import { Actor } from 'apify';
import { BasicCrawler, log } from 'crawlee';
import { gotScraping } from 'got-scraping';
// NOTE(review): FormData is imported but never used below — the multipart
// upload path (see the captured `payload` fixture) was never wired up.
import FormData from 'form-data';
// NOTE(review): 'jsdom' is imported here but is not listed in
// package.json dependencies — confirm it is installed transitively.
import { JSDOM } from 'jsdom';
import { createHash } from 'crypto';
7
8const createValidKVNameFromUrl = (url: string) => createHash('sha256').update(url).digest('hex').slice(0, -2);
9
// Static request headers: Origin/Referer make the request look like it
// comes from a normal Google session; Cache-Control/Pragma ask any
// intermediary not to serve a cached result page.
const headers: Record<string, string> = {
    'Alt-Used': 'www.google.com',
    'Origin': 'https://www.google.com',
    'Referer': 'https://www.google.com/',
    'Cache-Control': 'no-cache',
    'Pragma': "no-cache",
};
17
// Initialize the Apify SDK (storages, events) before anything else uses it.
await Actor.init();

// Shape of the actor input — mirrors INPUT_SCHEMA.json.
interface InputSchema {
    startUrls: string[]; // image URLs to reverse-search
    debug?: boolean;     // enables verbose (DEBUG-level) logging
}

const { startUrls = [], debug, } = await Actor.getInput<InputSchema>() ?? {};

if (debug) {
    log.setLevel(log.LEVELS.DEBUG);
}

// US residential proxies for all requests.
// NOTE(review): assumes the account has access to the RESIDENTIAL group.
const proxyConfiguration = await Actor.createProxyConfiguration({
    groups: ['RESIDENTIAL'],
    countryCode: 'US'
});
35
36const newUrl = (imageUrl: string) => {
37    const nUrl = new URL('https://www.google.com.br/searchbyimage');
38
39    nUrl.searchParams.set('image_url', imageUrl);
40    nUrl.searchParams.set('btnG', 'Search by image');
41    nUrl.searchParams.set('encoded_image', '');
42    nUrl.searchParams.set('image_content', '');
43    nUrl.searchParams.set('filename', '');
44    nUrl.searchParams.set('hl', 'en');
45
46    return nUrl.toString();
47}
48
// Store id is needed below to build public record URLs for saved thumbnails.
const { defaultKeyValueStoreId } = Actor.getEnv();
50
// BasicCrawler performs no HTTP by itself — each request is fetched manually
// with got-scraping through a per-session proxy, then parsed with JSDOM.
const crawler = new BasicCrawler({
    maxConcurrency: 3,
    useSessionPool: true, // rotate sessions so a blocked proxy identity gets retired
    async requestHandler({ session, request }) {
        const { userData } = request;

        // Fetch the search-by-image results page; proxy URL is keyed to the
        // session id so retries within a session reuse the same exit IP.
        // NOTE(review): `as any` suppresses got-scraping option typing — verify
        // the options object against the installed got-scraping version.
        const response = await gotScraping({
            url: newUrl(request.url),
            method: 'GET',
            proxyUrl: await proxyConfiguration!.newUrl(session!.id),
            headers,
            responseType: 'text',
        } as any);

        // Execute the page in JSDOM with inline scripts enabled — presumably
        // some result markup (e.g. base64 thumbnails) is rendered by JS.
        // NOTE(review): runScripts 'dangerously' executes remote JS in-process.
        const { window } = new JSDOM(response.body, {
            url: 'https://www.google.com/search/',
            runScripts: 'dangerously',
            pretendToBeVisual: true,
        });

        const { document } = window;

        // No #rso container means no organic results (likely a block/captcha
        // page); throwing makes the crawler retry, with a fresh session.
        if (!document.querySelectorAll('#rso').length) {
            throw new Error(`No results found`);
        }

        // Google's suggested search link for this image, when present.
        const relatedSearch = document.querySelector<HTMLAnchorElement>('#topstuff a.fKDtNb[href^="/search"]')?.href;

        // Results listed under the "matching images" style header section.
        const matches = Array.from<HTMLDivElement>(document.querySelectorAll<HTMLDivElement>('#search .normal-header ~ .g')).map((div) => {
            const [info, snippet] = Array.from(div.querySelectorAll('[data-content-feature="1"] > div > span'));
            const date = info?.querySelector?.('span:nth-child(3)')?.textContent ?? null;

            return {
                title: div?.querySelector?.('h3')?.textContent,
                url: div?.querySelector?.('a')?.href,
                // keep only text that looks like a date, e.g. "Jan 1, 2020"
                // NOTE(review): `date` can be null here; RegExp.test coerces it
                // to the string "null" (no match) — works, but strict TS flags it.
                date: /\d, \d/.test(date) ? date : null,
                text: snippet?.textContent || null,
            };
        });

        // Plain organic results from the first #rso result group.
        const results = Array.from<HTMLDivElement>(document.querySelectorAll('#rso > div:first-child .g')).map((div) => {
            const [info, snippet] = Array.from(div.querySelectorAll('[data-content-feature="1"] > div > span'));

            return {
                title: div?.querySelector?.('h3')?.textContent,
                url: div?.querySelector?.('a')?.href,
                date: /\d, \d/.test(info?.textContent) ? info.querySelector('span')?.textContent : null,
                text: snippet?.textContent || null,
            };
        });

        // Visually-similar images; `data-lpage` holds the source page URL and
        // the <img> src carries an inline data: URI for the thumbnail.
        const images = Array.from<HTMLDivElement>(document.querySelectorAll('[data-lpage]')).map((div) => {
            return {
                image: '', // filled in below with a public KV-store record URL
                url: div?.dataset?.['lpage'],
                imageData: div?.querySelector?.('img')?.src,
            };
        });

        // Persist each inline thumbnail into the key-value store and replace
        // the bulky base64 payload with a stable public URL.
        for (const image of images) {
            if (!image.imageData || !image.url) {
                continue;
            }

            const hash = createValidKVNameFromUrl(image.url);
            // "data:image/jpeg;base64,AAAA…" splits into ['', 'image/jpeg', 'AAAA…']
            const [, contentType, imageData] = image.imageData.split(/data:|;base64,/);

            await Actor.setValue(
                hash,
                Buffer.from(imageData, 'base64'),
                { contentType }
            );

            delete image.imageData;
            image.image = `https://api.apify.com/v2/key-value-stores/${defaultKeyValueStoreId}/records/${hash}`;
        }

        // One dataset item per input image URL; userData is passed through
        // untouched so callers can correlate results with their input.
        await Actor.pushData({
            relatedSearch: relatedSearch || null,
            matches,
            results,
            images,
            userData,
        });

        // await Actor.setValue('OUTPUT', dom.serialize(), { contentType: 'text/plain' });
    },
});
139
// Each start URL is treated directly as an image URL to reverse-search.
await crawler.addRequests(startUrls);

log.info('Starting the crawl.');
await crawler.run();
log.info('Crawl finished.');

// Flush storages and signal a clean exit to the platform.
await Actor.exit();

crawlee_storage/key_value_stores/default/INPUT.json

1{
2    "startUrls": [
3        "https://apify.com"
4    ]
5}

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.idea
4dist
5node_modules
6apify_storage
7crawlee_storage

Dockerfile

1# using multistage build, as we need dev deps to build the TS source code
2FROM apify/actor-node:18-beta AS builder
3
4# copy all files, install all dependencies (including dev deps) and build the project
5COPY . ./
6RUN npm install --include=dev \
7    && npm run build
8
9# create final image
10FROM apify/actor-node:18-beta
11# copy only necessary files
12COPY --from=builder /usr/src/app/package.json ./
13COPY --from=builder /usr/src/app/README.md ./
14COPY --from=builder /usr/src/app/dist ./dist
15COPY --from=builder /usr/src/app/apify.json ./apify.json
16COPY --from=builder /usr/src/app/INPUT_SCHEMA.json ./INPUT_SCHEMA.json
17
18# install only prod deps
19RUN npm --quiet set progress=false \
20    && npm install --only=prod --no-optional \
21    && echo "Installed NPM packages:" \
22    && (npm list --only=prod --no-optional --all || true) \
23    && echo "Node.js version:" \
24    && node --version \
25    && echo "NPM version:" \
26    && npm --version
27
28# run compiled code
29CMD npm run start:prod

INPUT_SCHEMA.json

1{
  "title": "Google Search By Image input",
3  "type": "object",
4  "schemaVersion": 1,
5  "properties": {
6    "startUrls": {
7      "title": "Image URLs",
8      "type": "array",
9      "description": "Image URLs to search for",
10      "editor": "requestListSources",
11      "prefill": [
12        { 
13          "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dc/Steve_Jobs_Headshot_2010-CROP_%28cropped_2%29.jpg/1024px-Steve_Jobs_Headshot_2010-CROP_%28cropped_2%29.jpg",
14          "userData": {
15            "name": "Steve Jobs"
16          }
17        }
18      ]
19    },
20    "debug": {
21      "title": "Debug mode",
22      "type": "boolean",
23      "description": "Enable additional logging",
24      "editor": "checkbox",
25      "default": false
26    }
27  },
28  "required": [
29    "startUrls"
30  ]
31}

apify.json

1{
2    "name": "crawlee-cheerio-typescript",
3    "title": "Crawlee + Cheerio + Typescript",
4    "version": "0.0",
5    "buildTag": "latest"
6}

package.json

1{
2    "name": "crawlee-cheerio-typescript",
3    "version": "0.0.1",
4    "type": "module",
    "description": "Search Google by image URL (reverse image search) as an Apify actor.",
    "dependencies": {
        "apify": "^3.0.2",
        "crawlee": "^3.0.3",
        "got-scraping": "^3.2.10",
        "form-data": "^4.0.0",
        "jsdom": "^20.0.0"
    },
12    "devDependencies": {
13        "@apify/tsconfig": "^0.1.0",
14        "ts-node": "^10.9.1",
15        "typescript": "^4.7.4"
16    },
17    "scripts": {
18        "start": "npm run start:dev",
19        "start:prod": "node dist/main.js",
20        "start:dev": "ts-node-esm -T src/main.ts",
21        "build": "tsc",
22        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
23    },
24    "author": "It's not you it's me",
25    "license": "ISC"
26}

payload

1-----------------------------324172699818326226173603017879
2Content-Disposition: form-data; name="image_url"
3
4
5-----------------------------324172699818326226173603017879
6Content-Disposition: form-data; name="encoded_image"; filename=""
7Content-Type: application/octet-stream
8
9
10-----------------------------324172699818326226173603017879
11Content-Disposition: form-data; name="image_content"
12
13base64
14-----------------------------324172699818326226173603017879
15Content-Disposition: form-data; name="filename"
16
17x.jpg
18-----------------------------324172699818326226173603017879
19Content-Disposition: form-data; name="hl"
20
21en-BR
22-----------------------------324172699818326226173603017879--

tsconfig.json

1{
2    "extends": "@apify/tsconfig",
3    "compilerOptions": {
4        "module": "ES2022",
5        "target": "ES2022",
6        "outDir": "dist",
7        "noUnusedLocals": false,
8        "lib": ["DOM"]
9    },
10    "include": [
11        "./src/**/*"
12    ]
13}
Developer
Maintained by Community