Petition.lu signature extractor.

No credit card required

Petition.lu signature extractor.

Petition.lu signature extractor.

thibault/extract-signatures-chambre-depute-luxembourg

No credit card required

This Actor lets you extract signatures from the official Luxembourgish petition website petitions.lu. You'll get first name(s), last name(s), and city.

Dockerfile

# Base image: Node.js 16 with the Apify SDK preinstalled.
# Other available images: https://sdk.apify.com/docs/guides/docker-images
FROM apify/actor-node:16

# Copy only the package manifests first, so the "npm install" layer below
# stays cached when source files change but dependencies do not.
COPY package*.json ./

# Install production dependencies only (keeps the image small), quiet the
# progress bar, and dump the dependency tree plus tool versions so build
# logs are useful for debugging.
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Copy the remaining source files. Doing this after the install step means
# source-only changes rebuild quickly from cache.
COPY . ./

# No CMD needed here: the apify/actor-node base image already defines
#   CMD npm start
# which launches the entry point from the "scripts.start" field of
# package.json.

INPUT_SCHEMA.json

{
    "title": "Petition Crawling Input Schema",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "petitionUrl": {
            "title": "Petition URL (it should look like this: https://www.petitions.lu/voir-les-signatures/xxxx-xxxx)",
            "type": "string",
            "nullable": false,
            "description": "Please paste here the URL of the petition you want to crawl signatures from.",
            "editor": "textfield"
        }
    }
}

main.js

// Main entry point of the actor.
// It is referenced from the "scripts" section of the package.json file.
//
// This actor does no crawling itself: it assembles an input object for the
// generic apify/web-scraper actor and then metamorphs into it, delegating
// the browser work (navigation, pagination, jQuery injection) to web-scraper.

const Apify = require('apify');

Apify.main(async () => {
    // Get input of the actor. Input fields can be modified in the
    // INPUT_SCHEMA.json file. See https://docs.apify.com/actor/input-schema
    const input = await Apify.getInput();
    console.log('Input:');
    console.dir(input);

    // Fail fast with a clear message rather than metamorphing with an
    // undefined start URL (which would only surface as a cryptic error
    // inside web-scraper).
    if (!input || !input.petitionUrl) {
        throw new Error('Missing required input field "petitionUrl".');
    }

    // Input for apify/web-scraper, based on the actor task used as the
    // starting point.
    const metamorphInput = {
        // NOTE: DEVELOPMENT mode limits concurrency; kept as-is to preserve
        // the original behavior.
        "runMode": "DEVELOPMENT",
        "startUrls": [
            {
                "url": input.petitionUrl,
                "method": "GET"
            }
        ],
        "keepUrlFragments": false,
        "linkSelector": "a[href]",
        // Pseudo-URL matching the pagination links of the signature list
        // (tx_petition_singlepetitionsignatures[pagenumber]=N&cHash=...).
        "pseudoUrls": [
            {
                "purl": input.petitionUrl + "?tx_petition_singlepetitionsignatures%5Bpagenumber%5D=[(\\d)+]&cHash=[(\\w)+]",
                "method": "GET"
            }
        ],
        // The function accepts a single argument: the "context" object.
        // For a complete list of its properties and functions,
        // see https://apify.com/apify/web-scraper#page-function
        "pageFunction": async function pageFunction(context) {
            // jQuery is available because "injectJQuery" is enabled below.
            const $ = context.jQuery;
            const signatures = [];

            // Each signature renders as "<name>, <city>" inside a <p>.
            $('.petition-signatures div p').each(function () {
                const obj = {};
                const split = $(this).text().split(',');
                if (split.length === 2) {
                    // Common case: exactly "name, city".
                    obj.name = split[0];
                    obj.city = split[1].trim();
                } else {
                    // Names containing commas split into more than two parts:
                    // everything except the last part is the name, the last
                    // part is the city.
                    // BUG FIX: the original called split.subarray(...), which
                    // does not exist on plain arrays (it is a TypedArray
                    // method) and also dropped one name part via an
                    // off-by-one (length - 2 instead of length - 1).
                    obj.name = split
                        .slice(0, split.length - 1)
                        .map((part) => part.trim())
                        .join(' ');
                    obj.city = split[split.length - 1].trim();
                }
                signatures.push(obj);
            });

            // Log a count instead of interpolating the array directly, which
            // would print "[object Object],[object Object],...".
            context.log.info(`URL: ${context.request.url}, SIGNATURES: ${signatures.length}`);

            // The returned array is stored to the resulting dataset.
            return signatures;
        },
        "preNavigationHooks": `// We need to return array of (possibly async) functions here.
        // The functions accept two arguments: the "crawlingContext" object
        // and "gotoOptions".
        [
            async (crawlingContext, gotoOptions) => {
                // ...
            },
        ]`,
        "postNavigationHooks": `// We need to return array of (possibly async) functions here.
        // The functions accept a single argument: the "crawlingContext" object.
        [
            async (crawlingContext) => {
                // ...
            },
        ]`,
        "injectJQuery": true,
        "injectUnderscore": false,
        "proxyConfiguration": {
            "useApifyProxy": false
        },
        "proxyRotation": "RECOMMENDED",
        "useChrome": false,
        "useStealth": false,
        "ignoreSslErrors": false,
        "ignoreCorsAndCsp": false,
        "downloadMedia": false,
        "downloadCss": false,
        "waitUntil": [
            "networkidle2"
        ],
        "breakpointLocation": "NONE",
        "debugLog": false,
        "browserLog": false
    };

    // Metamorph into actor apify/web-scraper using the created input.
    await Apify.metamorph('apify/web-scraper', metamorphInput);
});

package.json

1{ 2 "name": "extract-signatures-chambre-depute-luxembourg", 3 "version": "0.0.1", 4 "dependencies": { 5 "apify": "^2.1.0" 6 }, 7 "scripts": { 8 "start": "node main.js" 9 }, 10 "author": "Thibault Milan" 11} 12
Developer
Maintained by Community
Actor stats
  • 3 users
  • 319 runs
  • Modified about 1 year ago

You might also like these Actors