Free eBook Scraper

epctex/gutenberg-scraper

Explore and Download Free eBooks - Find and download a wide selection of free eBooks from Project Gutenberg. Search by keywords and language preferences. Discover literary gems in multiple formats.
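
For illustration only (the field values below are examples; the accepted fields are defined in INPUT_SCHEMA.json further down), a run input could look like the following. Note that the actor uses one source at a time: a language other than "none" takes precedence over the search keyword, which in turn takes precedence over the start URLs (see getSources in src/tools.js).

{
    "search": "sherlock holmes",
    "language": "none",
    "maxItems": 10,
    "proxyConfig": { "useApifyProxy": true }
}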

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
# editorconfig-tools is unable to ignore long strings or URLs
max_line_length = null

.eslintrc

{
  "extends": "@apify"
}

.gitignore

apify_storage
node_modules
package-lock.json

.prettierrc.js

module.exports = {
    printWidth: 80,
    singleQuote: true,
    trailingComma: 'all',
};

Dockerfile

# First, specify the base Docker image. You can read more about
# the available images at https://sdk.apify.com/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:16

# Second, copy just package.json and package-lock.json since they should be
# the only files that affect "npm install" in the next step, to speed up the build.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick builds will be really fast
# for most source file changes.
COPY . ./

# Optionally, specify how to launch the source code of your actor.
# By default, Apify's base Docker images define the CMD instruction
# that runs the Node.js source code using the command specified
# in the "scripts.start" section of the package.json file.
# In short, the instruction looks something like this:
#
# CMD npm start

INPUT_SCHEMA.json

{
    "title": "Gutenberg.org Scraper",
    "description": "An actor that scrapes ebooks from gutenberg.org",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "search": {
            "title": "Search keyword",
            "type": "string",
            "description": "Keyword that you want to search for on Gutenberg.org",
            "editor": "textfield",
            "prefill": "book"
        },
        "language": {
            "title": "Language",
            "type": "string",
            "description": "Select a language that Gutenberg.org provides",
            "editor": "select",
            "default": "none",
            "enum": ["none", "en","zh","da","nl","eo","fi","fr","de","el","hu","it","la","pt","es","sv","tl","af","ale","ar","arp","brx","br","bg","rmr","ca","ceb","cs","et","fa","fy","fur","gla","gl","kld","grc","he","is","ilo","ia","iu","ga","ja","csb","kha","ko","lt","mi","myn","enm","nah","nap","nav","nai","no","oc","oji","ang","pl","ro","ru","sa","sr","sl","bgs","te","cy","yi"],
            "enumTitles": ["None", "English","Chinese","Danish","Dutch","Esperanto","Finnish","French","German","Greek","Hungarian","Italian","Latin","Portuguese","Spanish","Swedish","Tagalog","Afrikaans","Aleut","Arabic","Arapaho","Bodo","Breton","Bulgarian","Caló","Catalan","Cebuano","Czech","Estonian","Farsi","Frisian","Friulian","Gaelic, Scottish","Galician","Gamilaraay","Greek, Ancient","Hebrew","Icelandic","Iloko","Interlingua","Inuktitut","Irish","Japanese","Kashubian","Khasi","Korean","Lithuanian","Maori","Mayan Languages","Middle English","Nahuatl","Napoletano-Calabrese","Navajo","North American Indian","Norwegian","Occitan","Ojibwa","Old English","Polish","Romanian","Russian","Sanskrit","Serbian","Slovenian","Tagabawa","Telugu","Welsh","Yiddish"]
        },
        "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with. These should be category or book detail URLs",
            "prefill": [
                { "url": "https://www.gutenberg.org/browse/recent/last7" },
                { "url": "https://www.gutenberg.org/browse/titles/h" }
            ],
            "editor": "requestListSources"
        },
        "maxItems": {
            "title": "Max Items",
            "type": "integer",
            "description": "Maximum number of items you want in your results",
            "editor": "number",
            "prefill": 5
        },
        "extendOutputFunction": {
            "title": "Extend output function",
            "type": "string",
            "nullable": true,
            "description": "Function that takes a jQuery handle ($) as argument and returns data that will be merged with the default output",
            "prefill": "($) => { return {} }",
            "editor": "javascript"
        },
        "proxyConfig": {
            "title": "Proxy configuration",
            "type": "object",
            "description": "Select proxies to be used by your crawler.",
            "prefill": { "useApifyProxy": true },
            "editor": "proxy"
        }
    },
    "required": ["proxyConfig"]
}
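
The extend output function is supplied as a string in the actor input; the EBOOK route in src/routes.js evaluates it, calls it with the Cheerio handle ($) of the ebook detail page, and spreads whatever object it returns into the pushed item. A minimal sketch of such a function, where the extra field and its selector are only assumptions about the detail-page markup and may need adjusting:

($) => {
    // Illustrative only: collect the subject links shown on the detail page.
    const subjects = $('td[property="dcterms:subject"] a')
        .map((i, el) => $(el).text().trim())
        .get();
    return { subjects };
}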

apify.json

{
	"name": "gutenberg-scraper",
	"version": "0.0",
	"buildTag": "latest",
	"template": "basic_crawler"
}

package.json

{
	"name": "gutenberg-scraper",
	"version": "0.0.1",
	"description": "Gutenberg.org ebooks scraper.",
	"dependencies": {
		"@apify/http-request": "^2.1.2",
		"apify": "^2.3.2",
		"cheerio": "^1.0.0-rc.11"
	},
	"devDependencies": {
		"@apify/eslint-config": "^0.3.1",
		"eslint": "^8.18.0"
	},
	"scripts": {
		"start": "node src/main.js"
	},
	"author": "epcsht",
	"license": "ISC"
}

src/extractors.js

// Fetch ebook detail URLs from a browse list page
const fetchEbooksFromBrowse = ($) => {
    return $('a[href]')
        .filter((i, el) => $(el).attr('href').match(/ebooks\/[0-9]/g))
        .map((i, el) => `https://www.gutenberg.org${$(el).attr('href')}`)
        .get();
};

// Fetch ebook detail URLs from a search results page
const fetchEbooksFromSearch = ($) => {
    return $('.booklink a[href]')
        .filter((i, el) => $(el).attr('href').match(/ebooks\/[0-9]/g))
        .map((i, el) => `https://www.gutenberg.org${$(el).attr('href')}`)
        .get();
};

// Fetch default details of a single ebook
const fetchEbook = ($) => {
    // The HTML format link is identified by the "content" attribute of its format cell
    const htmlHref = $('.files td[property="dcterms:format"]')
        .filter((i, el) => ($(el).attr('content') || '').includes('text/html'))
        .find('a')
        .attr('href');

    return {
        author: $('a[itemprop="creator"]').text().trim(),
        title: $('h1[itemprop="name"]').text().trim(),
        language: $('tr[itemprop="inLanguage"]').text().trim().replace(/Language/, '')
            .trim(),
        htmlURL: htmlHref ? `https://www.gutenberg.org${htmlHref}` : null,
        epubURL: $('.files td[content="application/epub+zip"] a').attr('href')
            ? `https://www.gutenberg.org${$('.files td[content="application/epub+zip"] a').attr('href')}`
            : null,
        kindleURL: $('.files td[content="application/x-mobipocket-ebook"] a').attr('href')
            ? `https://www.gutenberg.org${$('.files td[content="application/x-mobipocket-ebook"] a').attr('href')}`
            : null,
        plainTextURL: $('.files td[content="text/plain; charset=utf-8"] a').attr('href')
            ? `https://www.gutenberg.org${$('.files td[content="text/plain; charset=utf-8"] a').attr('href')}`
            : null,
    };
};

module.exports = {
    fetchEbooksFromBrowse,
    fetchEbooksFromSearch,
    fetchEbook,
};

src/main.js

const Apify = require('apify');
const httpRequest = require('@apify/http-request');
const cheerio = require('cheerio');
const tools = require('./tools');

const {
    utils: { log },
} = Apify;

// Create crawler
Apify.main(async () => {
    log.info('PHASE -- STARTING ACTOR.');

    const input = await Apify.getInput();
    const { proxyConfig } = input;
    log.info('ACTOR OPTIONS: -- ', input);

    // Validate input
    tools.validateInput(input);

    // Create request queue
    const requestQueue = await Apify.openRequestQueue();

    // Initialize first requests
    const pages = await tools.getSources(input);
    for (const page of pages) {
        await requestQueue.addRequest({ ...page });
    }

    // Create router
    const router = tools.createRouter({ requestQueue, input });

    // Proxy configuration
    const proxyConfiguration = await tools.createProxyConfiguration({
        proxyConfig,
        required: true,
    });

    log.info('PHASE -- SETTING UP CRAWLER.');

    const crawler = new Apify.BasicCrawler({
        requestQueue,
        handleRequestTimeoutSecs: 120,
        useSessionPool: true,
        handleRequestFunction: async (context) => {
            const { request, session } = context;
            const { method, headers, payload } = request.userData;
            log.debug(`CRAWLER -- Processing ${request.url}`);

            // Send request
            const response = await httpRequest({
                url: request.url,
                method: method || 'GET',
                timeoutSecs: 120,
                ignoreSslErrors: true,
                ...(headers ? { headers } : {}),
                ...(payload ? { payload } : {}),
                ...(proxyConfiguration ? { proxyUrl: proxyConfiguration.newUrl(session.id) } : {}),
                abortFunction: (res) => {
                    // Abort on non-200 responses
                    if (!res || res.statusCode !== 200) {
                        return true;
                    }
                    return false;
                },
            }).catch((err) => {
                throw new Error(err);
            });

            // Add the parsed page to the context
            const data = response.body;
            const $ = cheerio.load(data);

            context.data = data;
            context.$ = $;

            // Blocking check
            const isBlocked = $('h2').text().includes('How can I get unblocked?');
            if (isBlocked) {
                session.markBad();
                throw new Error(`We got blocked by target on ${request.url}`);
            }

            // Redirect to route
            await router(request.userData.label, context);
        },
    });

    log.info('PHASE -- STARTING CRAWLER.');

    await crawler.run();

    log.info('PHASE -- ACTOR FINISHED.');
});

src/routes.js

// routes.js
const Apify = require('apify');
const extractors = require('./extractors');

const {
    utils: { log },
} = Apify;

// Item count
let itemCount = 0;

// BROWSE page crawler.
// Fetches the ebooks from the list
exports.BROWSE = async ({ $, request }, { requestQueue }) => {
    log.info(`CRAWLER -- Browse: Fetching ebooks from the url: ${request.url}`);

    // Check for error
    if ($('html').html().includes('There are too many')) {
        log.error(`There are too many books in this listing, so the website does not display it: ${request.url}`);
        process.exit(0);
    }

    // Fetch all current ebooks listed
    const ebooks = extractors.fetchEbooksFromBrowse($);

    // Add them to the request queue
    for (const ebook of ebooks) {
        await requestQueue.addRequest(
            {
                url: ebook,
                userData: {
                    label: 'EBOOK',
                },
            },
            { forefront: true },
        );
    }

    log.debug(`CRAWLER -- ${ebooks.length} ebooks added to queue`);
};

// SEARCH page crawler.
// Fetches the ebooks from the search page and
// adds the next page to the request queue
exports.SEARCH = async ({ $, request }, { requestQueue }) => {
    const { page, baseURL } = request.userData;
    log.info(`CRAWLER -- Search: Fetching ebooks from the url: ${request.url} page: ${page}`);

    // Fetch all current ebooks listed
    const ebooks = extractors.fetchEbooksFromSearch($);

    // Add them to the request queue
    for (const ebook of ebooks) {
        await requestQueue.addRequest(
            {
                url: ebook,
                userData: {
                    label: 'EBOOK',
                },
            },
            { forefront: true },
        );
    }

    // Move to the next page
    if (ebooks.length > 0) {
        await requestQueue.addRequest({
            url: `${baseURL}&start_index=${25 * page + 1}`,
            userData: {
                label: 'SEARCH',
                page: page + 1,
                baseURL,
            },
        });
    }

    log.debug(`CRAWLER -- ${ebooks.length} ebooks added to queue`);
};

// EBOOK page crawler.
// Gets details of the current ebook
exports.EBOOK = async ({ $, request }, { input }) => {
    const { extendOutputFunction, maxItems } = input;
    log.info(`CRAWLER -- Fetching ebook details from ${request.url}`);

    // Check for maxItems
    if (maxItems && maxItems <= itemCount) {
        process.exit(0);
    }

    // Fetch ebook details
    const ebook = extractors.fetchEbook($);

    // Push data
    await Apify.pushData({ ...ebook, ...(extendOutputFunction ? { ...eval(extendOutputFunction)($) } : {}) });
    itemCount += 1;

    log.debug(`CRAWLER -- ebook details fetched from ${request.url}`);
};

src/tools.js

const Apify = require('apify');
const routes = require('./routes');

const {
    utils: { log },
} = Apify;

// Retrieves sources and returns objects for the request queue
exports.getSources = async (input) => {
    log.debug('Getting sources');

    // Get user input
    const { search, language, startUrls } = input;

    // Build start URLs
    if (language && language !== 'none') {
        return [{
            url: `https://www.gutenberg.org/browse/languages/${language}`,
            userData: {
                label: 'BROWSE',
                baseURL: `https://www.gutenberg.org/browse/languages/${language}`,
            },
        }];
    }

    if (search) {
        return [{
            url: `https://www.gutenberg.org/ebooks/search/?query=${search.toLowerCase().replace(/ /g, '+')}`,
            userData: {
                label: 'SEARCH',
                page: 1,
                baseURL: `https://www.gutenberg.org/ebooks/search/?query=${search.toLowerCase().replace(/ /g, '+')}`,
            },
        }];
    }

    if (startUrls && startUrls.length > 0) {
        return startUrls.map((startUrl) => {
            let label = '';
            let page = null;

            // Choose the route depending on the URL
            if (startUrl.url.match(/\/ebooks\/[0-9]/g)) {
                label = 'EBOOK';
            }

            if (startUrl.url.match(/\/browse\//g)) {
                label = 'BROWSE';
                page = 1;
            }

            if (startUrl.url.match(/ebooks\/search/g)) {
                label = 'SEARCH';
                page = 1;
            }

            return {
                url: startUrl.url,
                userData: {
                    label,
                    ...(page ? { page } : {}),
                },
            };
        });
    }
};

// Create router
exports.createRouter = (globalContext) => {
    return async function (routeName, requestContext) {
        const route = routes[routeName];
        if (!route) throw new Error(`No route for name: ${routeName}`);
        log.debug(`Invoking route: ${routeName}`);
        return route(requestContext, globalContext);
    };
};

// Validate input
exports.validateInput = (input) => {
    const {
        search,
        language,
        startUrls,
    } = input;

    if (!search && !language && (!startUrls || startUrls.length === 0)) {
        throw new Error('Actor must have at least one of the following attributes: start URL, search or language! Aborting.');
    }
};

// Create proxy configuration
exports.createProxyConfiguration = async ({
    proxyConfig,
    required = true,
    force = Apify.isAtHome(),
    blacklist = ['GOOGLESERP'],
    hint = [],
}) => {
    const configuration = await Apify.createProxyConfiguration(proxyConfig);

    // this works for custom proxyUrls
    if (Apify.isAtHome() && required) {
        if (!configuration || (!configuration.usesApifyProxy && (!configuration.proxyUrls || !configuration.proxyUrls.length)) || !configuration.newUrl()) {
            throw new Error('\n=======\nYou must use Apify proxy or custom proxy URLs\n\n=======');
        }
    }

    // check when running on the platform by default
    if (force) {
        // the groups only need to be checked when the Apify proxy is actually used
        if (configuration && configuration.usesApifyProxy) {
            if (blacklist.some((blacklisted) => (configuration.groups || []).includes(blacklisted))) {
                throw new Error(`\n=======\nThese proxy groups cannot be used in this actor. Choose another group or contact support@apify.com to get a proxy trial:\n\n*  ${blacklist.join('\n*  ')}\n\n=======`);
            }

            // specific non-automatic proxy groups like RESIDENTIAL are not an error, just a hint
            if (hint.length && !hint.some((group) => (configuration.groups || []).includes(group))) {
                Apify.utils.log.info(`\n=======\nYou can pick specific proxy groups for a better experience:\n\n*  ${hint.join('\n*  ')}\n\n=======`);
            }
        }
    }

    return configuration;
};
Developer
Maintained by Community
Actor metrics
  • 8 monthly users
  • 94.2% runs succeeded
  • Created in Feb 2020
  • Modified about 20 hours ago
