XMLs To Dataset

mtrunkat/xmls-to-dataset

Use this Actor whenever you need to download XML files, parse them to JSON, and store the results in a dataset.
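
Besides starting it from the Apify Console, the Actor can be called programmatically. Below is a minimal sketch using the apify-client package; the token value is a placeholder and the input simply mirrors the prefills from the input schema further down.

const { ApifyClient } = require('apify-client');

const client = new ApifyClient({ token: 'YOUR_APIFY_TOKEN' }); // placeholder token

(async () => {
    // Start the Actor and wait for the run to finish.
    const run = await client.actor('mtrunkat/xmls-to-dataset').call({
        sources: [{ url: 'https://www.w3schools.com/xml/plant_catalog.xml' }],
        proxy: { useApifyProxy: true },
    });

    // Read the parsed XML records from the run's default dataset.
    const { items } = await client.dataset(run.defaultDatasetId).listItems();
    console.log(items);
})();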

Dockerfile

# This is a template for a Dockerfile used to run acts in Actor system.
# The base image name below is set during the act build, based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
FROM apify/actor-node

# Second, copy just package.json and package-lock.json since it should be
# the only file that affects "npm install" in the next step, to speed up the build
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
    && npm install --only=prod --no-optional \
    && echo "Installed NPM packages:" \
    && (npm list --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version

# Copy source code to container
# Do this in the last step, to have fast build if only the source code changed
COPY . ./

# NOTE: The CMD is already defined by the base image.
# Uncomment this for local node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]

INPUT_SCHEMA.json

{
    "title": "XMLS To Dataset input",
    "description": "Enter the XML URLs you want to be downloaded.",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "sources": {
            "title": "URLs of XML files",
            "type": "array",
            "description": "Enter the XML URLs you want to be downloaded.",
            "prefill": [
                { "url": "https://www.w3schools.com/xml/plant_catalog.xml" },
                { "url": "https://www.w3schools.com/xml/cd_catalog.xml" }
            ],
            "editor": "requestListSources"
        },
        "proxy": {
            "title": "Proxy configuration",
            "type": "object",
            "description": "Select proxies to be used to download XML files.",
            "prefill": { "useApifyProxy": true },
            "editor": "proxy"
        }
    },
    "required": ["sources", "proxy"]
}
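
For reference, one concrete input object that fits this schema (the URLs are just the prefilled samples, suitable for pasting into the JSON editor in the Apify Console):

{
    "sources": [
        { "url": "https://www.w3schools.com/xml/plant_catalog.xml" },
        { "url": "https://www.w3schools.com/xml/cd_catalog.xml" }
    ],
    "proxy": { "useApifyProxy": true }
}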

main.js

const Apify = require('apify');
const util = require('util');
const parseString = require('xml2js').parseString;
const _ = require('underscore');

const parseStringPromised = util.promisify(parseString);

Apify.main(async () => {
    const {
        sources,
        proxy,
    } = await Apify.getValue('INPUT');

    const proxyConfiguration = await Apify.createProxyConfiguration(proxy);
    const requestList = await Apify.openRequestList('urls', sources);

    const crawler = new Apify.BasicCrawler({
        requestList,

        // Download each XML file, parse it with xml2js and store the result in the default dataset.
        handleRequestFunction: async ({ request }) => {
            const { body, statusCode } = await Apify.utils.requestAsBrowser({
                url: request.url,
                proxyUrl: proxyConfiguration.newUrl(),
            });

            if (statusCode >= 300) throw new Error(`Request failed with statusCode=${statusCode}`);

            await Apify.pushData({
                data: await parseStringPromised(body),
                request,
            });
        },

        // Requests that exhaust all retries are recorded in the dataset as failed.
        handleFailedRequestFunction: async ({ request }) => {
            await Apify.pushData({
                failed: true,
                request,
            });
        },
    });

    await crawler.run();
});
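
Each successfully downloaded file is stored as a dataset record of the form { data, request }, where data is the plain JavaScript object produced by xml2js. A small sketch of that parsing step; the XML fragment is hypothetical, loosely modeled on the prefilled plant catalog, and with default xml2js options child elements end up wrapped in arrays:

const util = require('util');
const parseString = require('xml2js').parseString;

const parseStringPromised = util.promisify(parseString);

// Hypothetical fragment in the style of plant_catalog.xml.
const xml = '<CATALOG><PLANT><COMMON>Bloodroot</COMMON><PRICE>$2.44</PRICE></PLANT></CATALOG>';

(async () => {
    const data = await parseStringPromised(xml);
    // With default xml2js options this prints roughly:
    // {"CATALOG":{"PLANT":[{"COMMON":["Bloodroot"],"PRICE":["$2.44"]}]}}
    console.log(JSON.stringify(data));
})();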

package.json

{
    "name": "apify-project",
    "version": "0.0.1",
    "description": "",
    "author": "It's not you it's me",
    "license": "ISC",
    "dependencies": {
        "apify": "latest",
        "xml2js": "latest",
        "underscore": "latest"
    },
    "scripts": {
        "start": "node main.js"
    }
}
Developer
Maintained by Community
Actor stats
  • 66 users
  • 2.4k runs
  • Modified over 1 year ago