CSV File to Dataset avatar
CSV File to Dataset

Pricing

Pay per usage

Go to Store
CSV File to Dataset

CSV File to Dataset

Developed by

Lukáš Křivka

Lukáš Křivka

Maintained by Community

Upload a local or remote CSV/text file and convert it to an Apify Dataset for further use.

0.0 (0)

Pricing

Pay per usage

5

Total users

93

Monthly users

11

Runs succeeded

>99%

Last modified

8 months ago

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:18

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging. The NPM cache is deleted at the end; `-f` keeps the
# build from failing when the cache directory was never created.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -rf ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image. The exec (JSON array) form starts npm directly instead of
# wrapping it in `/bin/sh -c`, so stop signals sent to the container are
# delivered to npm rather than to an intermediate shell.
CMD ["npm", "start", "--silent"]

.actor/actor.json

{
"actorSpecification": 1,
"name": "my-actor-25",
"title": "CSV File to Dataset",
"description": "Upload a local or remote CSV/text file and convert it to an Apify Dataset for further use.",
"version": "0.0",
"meta": {
"templateId": "js-crawlee-cheerio"
},
"input": "./input_schema.json",
"dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
"title": "CSV File to Dataset input",
"type": "object",
"schemaVersion": 1,
"properties": {
"csvUrls": {
"title": "Upload or link a CSV or text file",
"type": "array",
"description": "Upload or link a CSV with data",
"editor": "requestListSources"
},
"separator": {
"title": "Column separator",
"type": "string",
"default": ",",
"description": "Usually `,` or `;`",
"editor": "textfield"
}
}
}

src/main.js

import { Actor, log } from 'apify';

import { gotScraping } from 'got-scraping';
import neatCsv from 'neat-csv';

// Entry script: downloads every CSV/text file referenced in the actor input,
// parses it into row objects and pushes the rows to the default dataset.

// Initialize the Apify SDK
await Actor.init();

// Fall back to an empty object so that a missing INPUT record leads to the
// explicit failure below instead of a destructuring TypeError.
const { csvUrls, separator = ',' } = (await Actor.getValue('INPUT')) ?? {};

if (!Array.isArray(csvUrls)) {
    // NOTE(review): the code below relies on Actor.fail() terminating the
    // process, which is its documented default behavior - confirm for the
    // SDK version in use.
    await Actor.fail('Input field "csvUrls" is missing or is not an array of file sources.');
}

// An empty string is not a usable column separator, so treat it like "not set".
const columnSeparator = separator || ',';

// Each source is expected to carry its address in `url` or `requestsFromUrl`;
// entries with neither are dropped.
const urls = csvUrls.map((req) => req?.url || req?.requestsFromUrl).filter(Boolean);

if (urls.length < csvUrls.length) {
    log.warning(`Skipping ${csvUrls.length - urls.length} input source(s) without a URL.`);
}

await Actor.setStatusMessage(`Received ${urls.length} CSV URLs. Starting download.`);

// Files are processed one at a time, in input order.
for (const url of urls) {
    let data;
    try {
        // Download and parsing share one handler so that either kind of
        // failure ends the run with a readable status message.
        const { body } = await gotScraping(url);
        data = await neatCsv(body.toString(), { separator: columnSeparator });
    } catch (e) {
        await Actor.fail(`Could not download or parse ${url} with error: ${e}`);
    }
    await Actor.setStatusMessage(`Received ${data.length} rows from ${url}. Starting to push to the dataset, this might take a while.`);
    await Actor.pushData(data);
}

await Actor.exit(`CSV successfully converted to a dataset with ID: ${Actor.getEnv().defaultDatasetId}`);

.dockerignore

# configurations
.idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
node_modules
# git folder
.git

.editorconfig

root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
"extends": "@apify",
"root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
dist
node_modules
apify_storage
storage

package.json

{
"name": "crawlee-cheerio-javascript",
"version": "0.0.1",
"type": "module",
"description": "Apify actor that converts a local or remote CSV/text file into an Apify Dataset.",
"engines": {
"node": ">=18.0.0"
},
"dependencies": {
"apify": "^3.1.10",
"crawlee": "^3.5.4",
"neat-csv": "^7.0.0"
},
"devDependencies": {
"@apify/eslint-config": "^0.4.0",
"eslint": "^8.50.0"
},
"scripts": {
"start": "node src/main.js",
"lint": "eslint ./src --ext .js,.jsx",
"lint:fix": "eslint ./src --ext .js,.jsx --fix",
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
},
"author": "It's not you it's me",
"license": "ISC"
}