
KL Lyrics Scrapper

azibcepe/lirik-kapanlagi-com

This is a lyrics scraper for an Indonesian lyrics site. Once a run finishes, you will have a large dataset of lyrics (see the retrieval sketch below). Warning: use it at your own risk.


Author: Aris Riswanto
  • Users: 4
  • Runs: 325
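
The dataset a run produces can be pulled down afterwards with the standard Apify client. A minimal sketch, assuming the apify-client package and a placeholder dataset ID (replace <DATASET_ID> with the run's default dataset ID and supply your own API token):

const { ApifyClient } = require("apify-client");

// Sketch only: the token and <DATASET_ID> are placeholders, not part of the actor.
const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

client.dataset("<DATASET_ID>").listItems().then(({ items }) => {
    // Each item is { url, title, lirik } as pushed by main.js below.
    console.log(`Fetched ${items.length} lyric records`);
});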

.editorconfig

root = true

[*]
indent_style = space
indent_size = 2
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
    "extends": "@apify"
}

.gitignore

# This file tells Git which files shouldn't be added to source control

.idea
node_modules

Dockerfile

# First, specify the base Docker image. You can read more about
# the available images at https://sdk.apify.com/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:16

# Second, copy just package.json and package-lock.json since they should be
# the only files that affect "npm install" in the next step, to speed up the build
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, a quick build will be really fast
# for most source file changes.
COPY . ./

# Optionally, specify how to launch the source code of your actor.
# By default, Apify's base Docker images define the CMD instruction
# that runs the Node.js source code using the command specified
# in the "scripts.start" section of the package.json file.
# In short, the instruction looks something like this:
#
# CMD npm start
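
If you want to try the image outside the Apify platform, a local build and run along these lines should work (the kl-lyrics-scraper tag is just an illustrative name, not defined anywhere in the project):

docker build -t kl-lyrics-scraper .
docker run --rm kl-lyrics-scraper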

INPUT_SCHEMA.json

{
    "title": "Input schema for the apify_project actor.",
    "type": "object",
    "schemaVersion": 1,
    "properties": {},
    "required": []
}
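
The schema above is empty, so the actor currently takes no input. As a purely hypothetical sketch of how it could be extended, the snippet below adds a "languages" option; main.js would then have to read it via Apify.getInput(), which it does not do today:

{
    "title": "Input schema for the apify_project actor.",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "languages": {
            "title": "Languages",
            "type": "array",
            "description": "Language codes to scrape, e.g. [\"id\", \"en\"].",
            "editor": "json",
            "prefill": ["id", "en"]
        }
    },
    "required": []
}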

README.md

# Lirik Scrapers

Just use this and run (see the usage sketch below).

### Table of contents

<!-- toc start -->
- Introduction
  - Use it at your own risk
- Use Cases
  - Machine learning
  - Lyrics recognition
<!-- toc end -->
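
A sketch of running the actor locally, assuming the Apify CLI is installed and you are in the project directory (results land in the SDK's default local storage):

    npm install
    apify run
    # scraped lyrics end up in ./apify_storage/datasets/default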

apify.json

{
    "env": { "npm_config_loglevel": "silent" }
}
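
The file above only silences npm logging. For reference, the apify.json generated by the Apify CLI of that era typically also carried the actor name, version, and build tag alongside env; a sketch with placeholder values:

{
    "name": "lirik-kapanlagi-com",
    "version": "0.1",
    "buildTag": "latest",
    "env": { "npm_config_loglevel": "silent" }
}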

main.js

const Apify = require("apify");

// Letters a-z used to build the per-letter song index URLs.
const abjad = "abcdefghijklmnopqrstuvwxyz".split("");
const domain = "lirik.kapanlagi.com";
async function letsScrape() {
    const requestQueue = await Apify.openRequestQueue();
    // Seed the queue with the song index pages: one listing per letter
    // (plus the "num" bucket) for both Indonesian ("id") and English ("en") songs.
    for (const lang of ["id", "en"]) {
        const lt = ["num", ...abjad].map((i) => `${i}_${lang}`);
        for (const ls of lt) {
            await requestQueue.addRequest({ url: `https://${domain}/lagu/${ls}/` });
        }
    }
    const dataset = await Apify.openDataset();
    // Matches paginated listing pages such as /lagu/a_id/index2.html.
    const pagingMatcher = new Apify.PseudoUrl(
        `https://${domain}/lagu/[.+]/index[\\d].html`
    );
    // Pagination URLs that have already been enqueued.
    const cachedPagingUrl = {};
    const handlePageFunction = async ({ request, $ }) => {
        try {
            const url = new URL(request.url);
            // Song detail pages live under /artis/<artist>/<song>/,
            // i.e. their path splits into exactly five segments.
            if (
                url.pathname.startsWith("/artis") &&
                url.pathname.split("/").length === 5
            ) {
                // Collect the page title and the individual lyric lines.
                const title = $("title").text();
                const lirik = [];
                $(".col-lirik.lyrics-body > span.lirik_line").each((_, item) =>
                    lirik.push($(item).text())
                );
                await dataset.pushData({
                    url: request.url,
                    title,
                    lirik,
                });
            } else {
                // On listing pages, enqueue further pagination links.
                // Collect the hrefs first so the requests can be awaited
                // properly (an async callback passed to .each() is never awaited).
                const pagingUrls = [];
                $(".col-lirik.pagination2 a").each((_, item) => {
                    pagingUrls.push($(item).attr("href"));
                });
                for (const href of pagingUrls) {
                    if (href && pagingMatcher.matches(href) && !cachedPagingUrl[href]) {
                        await requestQueue.addRequest({ url: href });
                        cachedPagingUrl[href] = true;
                    }
                }
                // Also enqueue links to the individual song pages.
                await Apify.utils.enqueueLinks({
                    $,
                    requestQueue,
                    selector: ".div-horizontal2-list.multiline.multiline-ahalf > a",
                    pseudoUrls: [`https://${domain}/artis/[.+]/[.+]/`],
                });
            }
        } catch (e) {
            // Log and skip pages that fail to parse instead of crashing the run.
            Apify.utils.log.exception(e, `Failed to process ${request.url}`);
        }
    };
    const crawler = new Apify.CheerioCrawler({
        requestQueue,
        handlePageFunction,
        //maxConcurrency: 20,
        //maxRequestsPerCrawl: 2
    });
    return crawler;
}

Apify.main(async () => {
    const crawler = await letsScrape();
    await crawler.run();
});

package.json

{
    "name": "project-empty",
    "version": "0.0.1",
    "description": "This is a boilerplate of an Apify actor.",
    "dependencies": {
        "apify": "^2.0.0"
    },
    "devDependencies": {
        "@apify/eslint-config": "^0.1.3",
        "eslint": "^7.0.0"
    },
    "scripts": {
        "start": "node main.js",
        "lint": "./node_modules/.bin/eslint ./src --ext .js,.jsx",
        "lint:fix": "./node_modules/.bin/eslint ./src --ext .js,.jsx --fix",
        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
    },
    "author": "It's not you it's me",
    "license": "ISC"
}