Free eBook Scraper

tugkan/gutenberg-scraper

Scrape free eBooks from Project Gutenberg and download eBooks in all formats. Search for keywords and pick a language.

Author: Tuğkan Cengiz
  • Users: 29
  • Runs: 51

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
# editorconfig-tools is unable to ignore long strings or URLs
max_line_length = null

.eslintrc

{
  "extends": "@apify"
}

.gitignore

apify_storage
node_modules

.prettierrc.js

module.exports = {
    printWidth: 80,
    singleQuote: true,
    trailingComma: 'all',
};

Dockerfile

# Dockerfile contains instructions on how to build a Docker image that will contain
# all the code and configuration needed to run your actor. For a full
# Dockerfile reference, see https://docs.docker.com/engine/reference/builder/

# First, specify the base Docker image. Apify provides the following base images
# for your convenience:
#  apify/actor-node-basic (Node.js 10 on Alpine Linux, small and fast image)
#  apify/actor-node-chrome (Node.js 10 + Chrome on Debian)
#  apify/actor-node-chrome-xvfb (Node.js 10 + Chrome + Xvfb on Debian)
# For more information, see https://apify.com/docs/actor#base-images
# Note that you can use any other image from Docker Hub.
FROM apify/actor-node-basic

# Second, copy just package.json and package-lock.json since they are the only files
# that affect NPM install in the next step
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to keep the
# image small. Avoid logging too much and print the dependency tree for debugging
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && npm list \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick builds will be really fast
# for simple source file changes.
COPY . ./

# Specify how to run the source code
CMD npm start

INPUT_SCHEMA.json

{
    "title": "Gutenberg.org Scraper",
    "description": "An actor that scrapes ebooks from gutenberg.org",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "search":{
            "title": "Search keyword",
            "type": "string",
            "description": "Keyword that you want to search in Gutenberg.org",
            "editor": "textfield"
        },
        "language":{
            "title": "Language",
            "type": "string",
            "description": "Select a language that Gutenberg.org provides",
            "editor": "select",
            "default": "none",
            "enum": ["none", "en","zh","da","nl","eo","fi","fr","de","el","hu","it","la","pt","es","sv","tl","af","ale","ar","arp","brx","br","bg","rmr","ca","ceb","cs","et","fa","fy","fur","gla","gl","kld","grc","he","is","ilo","ia","iu","ga","ja","csb","kha","ko","lt","mi","myn","enm","nah","nap","nav","nai","no","oc","oji","ang","pl","ro","ru","sa","sr","sl","bgs","te","cy"],
            "enumTitles": ["None", "English","Chinese","Danish","Dutch","Esperanto","Finnish","French","German","Greek","Hungarian","Italian","Latin","Portuguese","Spanish","Swedish","Tagalog","Afrikaans","Aleut","Arabic","Arapaho","Bodo","Breton","Bulgarian","Caló","Catalan","Cebuano","Czech","Estonian","Farsi","Frisian","Friulian","Gaelic, Scottish","Galician","Gamilaraay","Greek, Ancient","Hebrew","Icelandic","Iloko","Interlingua","Inuktitut","Irish","Japanese","Kashubian","Khasi","Korean","Lithuanian","Maori","Mayan Languages","Middle English","Nahuatl","Napoletano-Calabrese","Navajo","North American Indian","Norwegian","Occitan","Ojibwa","Old English","Polish","Romanian","Russian","Sanskrit","Serbian","Slovenian","Tagabawa","Telugu","Welsh","Yiddish"]
        },
        "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with. It should be category or product detail URLs",
            "prefill": [
                { "url": "https://www.gutenberg.org/browse/recent/last7" },
                { "url": "https://www.gutenberg.org/browse/titles/h" }
            ],
            "editor": "requestListSources"
        },
        "maxItems":{
            "title": "Max Items",
            "type": "integer",
            "description": "Maximum number of items you want on your results",
            "editor": "number"
        },
        "extendOutputFunction": {
            "title": "Extend output function",
            "type": "string",
            "nullable": true,
            "description": "Function that takes a JQuery handle ($) as argument and returns data that will be merged with the default output",
            "prefill": "($) => { return {} }",
            "editor": "javascript"
        },
        "proxyConfig": {
            "title": "Proxy configuration",
            "type": "object",
            "description": "Optionally use Apify Proxy",
            "prefill": { "useApifyProxy": true, "apifyProxyGroups": ["SHADER"] },
            "editor": "proxy"
        }
    },
    "required": ["proxyConfig"]
}

README.md

# Actor - Gutenberg.org Scraper

Gutenberg.org Scraper is an [Apify actor](https://apify.com/actors) for extracting data about ebooks from [Gutenberg.org](https://gutenberg.org). It allows you to search for keywords and pick a language. It is built on top of the [Apify SDK](https://sdk.apify.com/), and you can run it both on the [Apify platform](https://my.apify.com) and locally.

- [Gutenberg.org Scraper Input Parameters](#input-parameters)
- [Gutenberg.org Scraper Input Example](#input-example)
- [Gutenberg.org Scraper Ebook Output](#output)
- [Extend output function](#extend-output-function)
- [Compute Unit Consumption](#compute-unit-consumption)
- [During The Run](#during-the-run)
- [Gutenberg.org Export](#export)


## Gutenberg.org Scraper Input Parameters

The input of this scraper should be JSON specifying which pages on Gutenberg should be visited. The supported fields are:

| Field | Type | Description |
| ----- | ---- | ----------- |
| search | String | (optional) The keyword that you want to search on Gutenberg |
| language | String | (optional) A language code that Gutenberg provides. You can fetch all ebooks of a language with it |
| startUrls | Array | (optional) List of Gutenberg URLs. You should only provide "search" or "browse" URLs |
| maxItems | Integer | (optional) Maximum number of items that the output will contain |
| extendOutputFunction | String | (optional) Function that takes a jQuery handle ($) as argument and returns data that will be merged with the default output. More information in [Extend output function](#extend-output-function) |
| proxyConfig | Object | (required) Proxy configuration |

This solution requires the use of **Proxy servers**, either your own proxy servers or you can use <a href="https://www.apify.com/docs/proxy">Apify Proxy</a>.


### Gutenberg Scraper Input Example
```json
{
    "proxyConfig": { "useApifyProxy": true },
    "startUrls": [
        { "url": "https://www.gutenberg.org/browse/recent/last7" },
        { "url": "https://www.gutenberg.org/browse/titles/h" }
    ]
}
```
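
If you only want keyword search results, a minimal input could look like this (the values are illustrative):
```json
{
    "proxyConfig": { "useApifyProxy": true },
    "search": "sherlock holmes",
    "maxItems": 100
}
```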

## Gutenberg Ebook Output
The structure of each item in Gutenberg ebooks looks like this:
```json
{
  "author": "United States. National Park Service",
  "title": "Cumberland Island: Junior Ranger Program Activity Guide for Ages 5-7",
  "language": "English",
  "htmlURL": "https://www.gutenberg.org/files/61452/61452-h/61452-h.htm",
  "epubURL": "https://www.gutenberg.org/ebooks/61452.epub.images?session_id=24e44a13d40847bb8d8b13a9216689880a3221cf",
  "kindleURL": "https://www.gutenberg.org/ebooks/61452.kindle.images?session_id=24e44a13d40847bb8d8b13a9216689880a3221cf",
  "plainTextURL": "https://www.gutenberg.org/files/61452/61452-0.txt"
}
```

### Extend output function

You can use this function to update the default output of this actor. This function gets a jQuery handle `$` as an argument, so you can choose what data from the page you want to scrape. The output from this function will be merged with the default output.

The return value of this function has to be an object!

You can return fields to achieve 3 different things:
- Add a new field - Return an object with a field that is not in the default output
- Change a field - Return an existing field with a new value
- Remove a field - Return an existing field with the value `undefined`
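
For example, a minimal sketch of such a function that does all three (the `interactionCount` selector is illustrative, not guaranteed to match the live page):

```js
($) => {
    return {
        // Add a new field that is not in the default output (illustrative selector)
        downloads: $('td[itemprop="interactionCount"]').text().trim(),
        // Change an existing field
        title: $('h1[itemprop="name"]').text().trim().toUpperCase(),
        // Remove a field from the default output
        plainTextURL: undefined,
    };
}
```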


### Compute Unit Consumption
The actor is optimized to run fast and scrape as many items as possible. Therefore, it forefronts all ebook detail requests. If the actor isn't blocked too often, it will scrape ~250 ebooks in 3 minutes using ~0.0235 compute units (roughly 0.09 compute units per 1,000 ebooks).

## During the Run

During the run, the actor will output messages letting you know what is going on. Each message always contains a short label specifying which page from the provided list is currently being processed.
When items are loaded from the page, you should see a message about this event with a loaded item count and total item count for each page.

If you provide incorrect input to the actor, it will immediately stop with a failure state and output an explanation of what is wrong.

## Gutenberg Export

During the run, the actor stores results into a dataset. Each item is a separate item in the dataset.

You can manage the results in any language (Python, PHP, Node.js/NPM). See the FAQ or <a href="https://www.apify.com/docs/api" target="_blank">our API reference</a> to learn more about getting results from this Gutenberg actor.
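
For example, a minimal Node.js sketch that fetches the items of a run's dataset over the Apify API (`DATASET_ID` is a placeholder for your dataset ID):

```js
// Fetch all items of a dataset as JSON from the Apify API.
const https = require('https');

https.get('https://api.apify.com/v2/datasets/DATASET_ID/items?format=json', (res) => {
    let body = '';
    res.on('data', (chunk) => { body += chunk; });
    res.on('end', () => console.log(JSON.parse(body)));
});
```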

apify.json

{
	"name": "gutenberg-scraper",
	"version": "0.0",
	"buildTag": "latest",
	"template": "cheerio_crawler"
}

package-lock.json

This file is 3819 lines long. Only the first 50 are shown.

{
	"name": "apify-gutenberg-actor",
	"version": "0.0.1",
	"lockfileVersion": 1,
	"requires": true,
	"dependencies": {
		"@apify/eslint-config": {
			"version": "0.0.3",
			"resolved": "https://registry.npmjs.org/@apify/eslint-config/-/eslint-config-0.0.3.tgz",
			"integrity": "sha512-WbjC0Xv1bEWN9DcOayv5y4Zygv4N8zPq/XZQygHfq+As+P6sxK5sSrAQzHIHWd+9jNX4TI9iJDcPqFzxsAxTgw==",
			"dev": true,
			"requires": {
				"eslint-config-airbnb": "^17.1.1",
				"eslint-config-airbnb-base": "^13.2.0",
				"eslint-plugin-import": "^2.18.2",
				"eslint-plugin-jsx-a11y": "^6.2.3",
				"eslint-plugin-promise": "^4.2.1",
				"eslint-plugin-react": "^7.14.3"
			}
		},
		"@apify/http-request": {
			"version": "1.1.5",
			"resolved": "https://registry.npmjs.org/@apify/http-request/-/http-request-1.1.5.tgz",
			"integrity": "sha512-wnT95dZKMbSl3K/K4hdDEnzGt/9cmf216jp3S3yDrAzlIA4eB3kDanI4gClaZ+yOsY3BRe8WNUPMIeCgWfksqQ==",
			"requires": {
				"apify-shared": "^0.1.67",
				"got": "^9.6.0",
				"proxy-agent": "^3.1.1"
			}
		},
		"@apify/ps-tree": {
			"version": "1.1.3",
			"resolved": "https://registry.npmjs.org/@apify/ps-tree/-/ps-tree-1.1.3.tgz",
			"integrity": "sha512-+hIr8EaTRd9fsOiNNzf1Fi8Tm9qs8cdPBZjuq5fXDV6SOCdi2ZyQlcQSzc8lY0hb+UhBib1WPixtCdKLL169WA==",
			"requires": {
				"event-stream": "3.3.4"
			}
		},
		"@babel/code-frame": {
			"version": "7.8.3",
			"resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.8.3.tgz",
			"integrity": "sha512-a9gxpmdXtZEInkCSHUJDLHZVBgb1QS0jhss4cPP93EW7s+uC5bikET2twEF3KV+7rDblJcmNvTR7VJejqd2C2g==",
			"dev": true,
			"requires": {
				"@babel/highlight": "^7.8.3"
			}
		},
		"@babel/highlight": {
			"version": "7.8.3",
			"resolved": "https://registry.npmjs.org/@babel/highlight/-/highlight-7.8.3.tgz",

package.json

{
	"name": "gutenberg-scraper",
	"version": "0.0.1",
	"description": "Gutenberg.org ebooks scraper.",
	"dependencies": {
		"apify": "^0.19.0"
	},
	"devDependencies": {
		"@apify/eslint-config": "0.0.3",
		"eslint": "^6.7.2"
	},
	"scripts": {
		"start": "node src/main.js"
	},
	"author": "Tugkan Cengiz",
	"license": "ISC"
}

src/extractors.js

// Fetch offers from browse list page
const fetchEbooksFromBrowse = ($) => {
    return $('a[href]')
        .filter((i, el) => $(el).attr('href').match(/ebooks\/[0-9]/g))
        .map((i, el) => `https://www.gutenberg.org${$(el).attr('href')}`)
        .get();
};

// Fetch offers from search list page
const fetchEbooksFromSearch = ($) => {
    return $('.booklink a[href]')
        .filter((i, el) => $(el).attr('href').match(/ebooks\/[0-9]/g))
        .map((i, el) => `https://www.gutenberg.org${$(el).attr('href')}`)
        .get();
};

// Fetch ebook defaults
const fetchEbook = ($) => {
    // Extract each download link first. A jQuery selection is always truthy,
    // so presence must be checked on the extracted href, not on the selection
    // itself (otherwise missing links would yield ".../undefined" URLs).
    const htmlHref = $('.files td[property="dcterms:format"]')
        .filter((i, el) => ($(el).attr('content') || '').includes('text/html'))
        .find('a')
        .attr('href');
    const epubHref = $('.files td[content="application/epub+zip"] a').attr('href');
    const kindleHref = $('.files td[content="application/x-mobipocket-ebook"] a').attr('href');
    const plainTextHref = $('.files td[content="text/plain; charset=utf-8"] a').attr('href');

    return {
        author: $('a[itemprop="creator"]').text().trim(),
        title: $('h1[itemprop="name"]').text().trim(),
        language: $('tr[itemprop="inLanguage"]').text().trim().replace(/Language/, '')
            .trim(),
        htmlURL: htmlHref ? `https://www.gutenberg.org${htmlHref}` : null,
        epubURL: epubHref ? `https://www.gutenberg.org${epubHref}` : null,
        kindleURL: kindleHref ? `https://www.gutenberg.org${kindleHref}` : null,
        plainTextURL: plainTextHref ? `https://www.gutenberg.org${plainTextHref}` : null,
    };
};

module.exports = {
    fetchEbooksFromBrowse,
    fetchEbooksFromSearch,
    fetchEbook,
};
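
A minimal local check of these extractors, assuming `cheerio` is available and using a hand-written HTML fragment instead of a real Gutenberg page:

```js
const cheerio = require('cheerio');
const { fetchEbook } = require('./extractors');

// Hand-written fragment mimicking the relevant parts of an ebook page.
const html = `
<h1 itemprop="name">Example Book</h1>
<a itemprop="creator">Example Author</a>
<table><tr itemprop="inLanguage"><td>Language English</td></tr></table>
<table class="files">
  <tr><td content="application/epub+zip"><a href="/ebooks/1.epub.images">EPUB</a></td></tr>
</table>`;

console.log(fetchEbook(cheerio.load(html)));
// => { author: 'Example Author', title: 'Example Book', language: 'English',
//      epubURL: 'https://www.gutenberg.org/ebooks/1.epub.images', ... }
```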

src/main.js

const Apify = require('apify');
const tools = require('./tools');

const {
    utils: { log },
} = Apify;

// Create crawler
Apify.main(async () => {
    log.info('PHASE -- STARTING ACTOR.');

    global.userInput = await Apify.getInput();
    const { search, language, startUrls, proxyConfig } = global.userInput;
    log.info('ACTOR OPTIONS: -- ', global.userInput);

    // Input validation
    if (!proxyConfig) {
        throw new Error('Actor must use proxy! Aborting.');
    }

    if (!search && !language && (!startUrls || startUrls.length === 0)) {
        throw new Error('Actor must have at least one of the following attributes: starting url, search or language! Aborting.');
    }

    // Create request queue
    const requestQueue = await Apify.openRequestQueue();

    // Initialize first request
    const pages = await tools.getSources();
    for (const page of pages) {
        await requestQueue.addRequest({ ...page });
    }

    // Create route
    const router = tools.createRouter({ requestQueue });


    log.info('PHASE -- SETTING UP CRAWLER.');
    const crawler = new Apify.CheerioCrawler({
        requestQueue,
        handlePageTimeoutSecs: 120,
        requestTimeoutSecs: 120,
        ignoreSslErrors: true,
        ...proxyConfig,
        useSessionPool: true,
        handlePageFunction: async (context) => {
            const { $, request, response, session } = context;
            log.debug(`CRAWLER -- Processing ${request.url}`);

            const isBlocked = $('h2').text().includes('How can I get unblocked?');

            // Status code check
            if (!response || response.statusCode !== 200 || isBlocked) {
                session.markBad();
                throw new Error(`We got blocked by target on ${request.url}`);
            } else {
                session.markGood();
            }

            // Redirect to route
            await router(request.userData.label, context);
        },
    });

    log.info('PHASE -- STARTING CRAWLER.');

    await crawler.run();

    log.info('PHASE -- ACTOR FINISHED.');
});

src/routes.js

This file is 103 lines long. Only the first 50 are shown.

// routes.js
const Apify = require('apify');
const extractors = require('./extractors');

const {
    utils: { log },
} = Apify;


// ItemCount
let itemCount = 0;

// BROWSE page crawler.
// Fetches the ebooks from the list
exports.BROWSE = async ({ $, request }, { requestQueue }) => {
    log.info(`CRAWLER -- Browse: Fetching ebooks from the url: ${request.url}`);


    // Check for error
    if ($('html').html().includes('There are too many')) {
        log.error(`There are too many books on the website; the list doesn't show them: ${request.url}`);
        process.exit(0);
    }

    // Fetch all current ebooks listed
    const ebooks = extractors.fetchEbooksFromBrowse($);

    // Add them to request queue
    for (const ebook of ebooks) {
        await requestQueue.addRequest(
            {
                url: ebook,
                userData: {
                    label: 'EBOOK',
                },
            },
            { forefront: true },
        );
    }

    log.debug(`CRAWLER -- ${ebooks.length} ebooks added to queue`);
};

// SEARCH page crawler.
// Fetches the ebooks from search page and
// Adds next page to request queue
exports.SEARCH = async ({ $, request }, { requestQueue }) => {
    const { page, baseURL } = request.userData;
    log.info(`CRAWLER -- Search: Fetching ebooks from the url: ${request.url} page: ${page}`);

src/tools.js

const Apify = require('apify');
const routes = require('./routes');

const {
    utils: { log },
} = Apify;

// Retrieves sources and returns object for request list
exports.getSources = async () => {
    log.debug('Getting sources');

    // Get user input
    const { search, language, startUrls } = global.userInput;

    // Build start URLs
    if (language && language !== 'none') {
        return [{
            url: `https://www.gutenberg.org/browse/languages/${language}`,
            userData: {
                label: 'BROWSE',
                baseURL: `https://www.gutenberg.org/browse/languages/${language}`,
            },
        }];
    }

    if (search) {
        // Replace all whitespace (not just the first space) when building the query
        const query = search.toLowerCase().replace(/\s+/g, '+');
        return [{
            url: `https://www.gutenberg.org/ebooks/search/?query=${query}`,
            userData: {
                label: 'SEARCH',
                page: 1,
                baseURL: `https://www.gutenberg.org/ebooks/search/?query=${query}`,
            },
        }];
    }

    if (startUrls && startUrls.length > 0) {
        return startUrls.map((startUrl) => {
            let label = '';
            let page = null;

            // Check route dependent on url
            if (startUrl.url.match(/\/ebooks\/[0-9]/g)) {
                label = 'EBOOK';
            }

            if (startUrl.url.match(/\/browse\//g)) {
                label = 'BROWSE';
                page = 1;
            }

            if (startUrl.url.match(/ebooks\/search/g)) {
                label = 'SEARCH';
                page = 1;
            }

            return {
                url: startUrl.url,
                userData: {
                    label,
                    ...(page ? { page } : {}),
                },
            };
        });
    }

    // Unreachable once the input validation in main.js has passed,
    // but return an empty list as a safeguard.
    return [];
};

// Create router
exports.createRouter = (globalContext) => {
    return async function (routeName, requestContext) {
        const route = routes[routeName];
        if (!route) throw new Error(`No route for name: ${routeName}`);
        log.debug(`Invoking route: ${routeName}`);
        return route(requestContext, globalContext);
    };
};
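
The router itself is just a label-to-handler dispatcher. A self-contained sketch of the same pattern:

```js
// Self-contained sketch of the label-based dispatch used above.
const routes = {
    GREET: async (ctx, globalContext) => `hello ${ctx.name} via ${globalContext.transport}`,
};

const createRouter = (globalContext) => async (routeName, requestContext) => {
    const route = routes[routeName];
    if (!route) throw new Error(`No route for name: ${routeName}`);
    return route(requestContext, globalContext);
};

(async () => {
    const router = createRouter({ transport: 'http' });
    console.log(await router('GREET', { name: 'world' })); // hello world via http
})();
```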