Petition.lu signature extractor. avatar
Petition.lu signature extractor.
Try for free

No credit card required

View all Actors
Petition.lu signature extractor.

Petition.lu signature extractor.

thibault/extract-signatures-chambre-depute-luxembourg
Try for free

No credit card required

This Actor lets you extract signatures from petitions.lu, the official Luxembourgish petition website. For each signature you get the first name(s), last name(s), and city.

Dockerfile

# First, specify the base Docker image. You can read more about
# the available images at https://sdk.apify.com/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:16

# Second, copy just package.json and package-lock.json since those are the only
# files that affect "npm install" in the next step, to speed up the build.
COPY package*.json ./

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
# "--omit=dev --omit=optional" replaces the deprecated "--only=prod --no-optional"
# flags, which newer npm versions only accept with a deprecation warning.
RUN npm --quiet set progress=false \
 && npm install --omit=dev --omit=optional \
 && echo "Installed NPM packages:" \
 && (npm list || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Optionally, specify how to launch the source code of your actor.
# By default, Apify's base Docker images define the CMD instruction
# that runs the Node.js source code using the command specified
# in the "scripts.start" section of the package.json file.
# In short, the instruction looks something like this:
#
# CMD npm start

INPUT_SCHEMA.json

1{
2    "title": "Petition Crawling Input Schema",
3    "type": "object",
4    "schemaVersion": 1,
5    "properties": {
6        "petitionUrl": {
7            "title": "Petition URL (it should look like this https://www.petitions.lu/voir-les-signatures/xxxx-xxxx)",
8            "type": "string",
9            "nullable": false,
10            "description": "Paste the URL of the petition whose signatures you want to crawl.",
11            "editor": "textfield"
12        }
13    }
14}

main.js

// This is the main Node.js source code file of your actor.
// It is referenced from the "scripts" section of the package.json file.
//
// The actor reads a petition URL from its input, builds an input object for
// the apify/web-scraper actor and then metamorphs into that actor. The page
// function below turns every signature paragraph into a { name, city } record.

const Apify = require('apify');

Apify.main(async () => {
    // Get input of the actor. Input fields can be modified in INPUT_SCHEMA.json file.
    // For more information, see https://docs.apify.com/actor/input-schema
    const input = await Apify.getInput();
    console.log('Input:');
    console.dir(input);

    // Fail fast on a missing URL: otherwise the start URL would be undefined and
    // the pseudo-URL below would be built from the literal text "undefined".
    const petitionUrl = input && typeof input.petitionUrl === 'string'
        ? input.petitionUrl.trim()
        : '';
    if (petitionUrl === '') {
        throw new Error('Input field "petitionUrl" is required and must be a non-empty string.');
    }

    // Here you can prepare your input for actor apify/web-scraper. This input is based on an actor
    // task you used as the starting point.
    const metamorphInput = {
        // NOTE(review): "DEVELOPMENT" is kept from the original task; confirm whether
        // "PRODUCTION" is the intended run mode for published runs.
        "runMode": "DEVELOPMENT",
        "startUrls": [
            {
                "url": petitionUrl,
                "method": "GET"
            }
        ],
        "keepUrlFragments": false,
        "linkSelector": "a[href]",
        "pseudoUrls": [
            {
                // Follow only the pagination links of this petition's signature list.
                "purl": `${petitionUrl}?tx_petition_singlepetitionsignatures%5Bpagenumber%5D=[(\\d)+]&cHash=[(\\w)+]`,
                "method": "GET"
            }
        ],
        "pageFunction": // The function accepts a single argument: the "context" object.
        // For a complete list of its properties and functions,
        // see https://apify.com/apify/web-scraper#page-function
        // It is passed to the scraper as part of the input, so it presumably has to be
        // self-contained (no references to variables declared outside of it) - verify
        // before extracting helpers.
        async function pageFunction(context) {
            // This statement works as a breakpoint when you're trying to debug your code. Works only with Run mode: DEVELOPMENT!
            // debugger;

            // jQuery is handy for finding DOM elements and extracting data from them.
            // To use it, make sure to enable the "Inject jQuery" option.
            const $ = context.jQuery;
            const signatures = [];

            // Each paragraph is expected to look like "<name parts>, <city>".
            $('.petition-signatures div p').each((index, element) => {
                const text = $(element).text().trim();
                if (text === '') {
                    // Nothing to extract from an empty paragraph.
                    return;
                }

                const parts = text.split(',').map((part) => part.trim());
                // The last comma-separated part is the city; everything before it
                // belongs to the name. Without any comma the whole text is the name.
                const city = parts.length > 1 ? parts.pop() : '';
                signatures.push({ name: parts.join(' '), city });
            });

            // Print some information to actor log. Log the count: interpolating the
            // array itself would only print "[object Object]" for every entry.
            context.log.info(`URL: ${context.request.url}, SIGNATURES: ${signatures.length}`);

            // Manually add a new page to the queue for scraping.
            // context.enqueueRequest({ url: 'http://www.example.com' });

            // Return the data extracted from the page.
            // It will be stored to the resulting dataset.
            return signatures;
        },
        "preNavigationHooks": `// We need to return array of (possibly async) functions here.
            // The functions accept two arguments: the "crawlingContext" object
            // and "gotoOptions".
            [
                async (crawlingContext, gotoOptions) => {
                    // ...
                },
            ]`,
        "postNavigationHooks": `// We need to return array of (possibly async) functions here.
            // The functions accept a single argument: the "crawlingContext" object.
            [
                async (crawlingContext) => {
                    // ...
                },
            ]`,
        "injectJQuery": true,
        "injectUnderscore": false,
        "proxyConfiguration": {
            "useApifyProxy": false
        },
        "proxyRotation": "RECOMMENDED",
        "useChrome": false,
        "useStealth": false,
        "ignoreSslErrors": false,
        "ignoreCorsAndCsp": false,
        "downloadMedia": false,
        "downloadCss": false,
        "waitUntil": [
            "networkidle2"
        ],
        "breakpointLocation": "NONE",
        "debugLog": false,
        "browserLog": false
    };

    // Now let's metamorph into actor apify/web-scraper using the created input.
    await Apify.metamorph('apify/web-scraper', metamorphInput);
});

package.json

1{
2    "name": "extract-signatures-chambre-depute-luxembourg",
3    "version": "0.0.1",
4    "dependencies": {
5        "apify": "^2.1.0"
6    },
7    "scripts": {
8        "start": "node main.js"
9    },
10    "author": "Thibault Milan"
11}
Developer
Maintained by Community
Actor metrics
  • 1 monthly user
  • 0 stars
  • Created in Nov 2021
  • Modified over 1 year ago