Petition.lu signature extractor.
Try for free
No credit card required
View all Actors
Petition.lu signature extractor.
thibault/extract-signatures-chambre-depute-luxembourg
Try for free
No credit card required
This Actor lets you extract signatures from petitions.lu, the official Luxembourgish petition website. For each signature you'll get the signer's first name(s), last name(s), and city.
Dockerfile
# First, specify the base Docker image. You can read more about
# the available images at https://sdk.apify.com/docs/guides/docker-images
# You can also use any other image from Docker Hub.
# NOTE(review): Node.js 16 is past end-of-life upstream; consider moving to a
# newer apify/actor-node tag once the actor has been tested against it.
FROM apify/actor-node:16

# Second, copy just package.json and package-lock.json since those are the only
# files that affect "npm install" in the next step, to speed up the build.
COPY package*.json ./

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. "--omit" is the supported spelling on current npm;
# the older "--only=prod --no-optional" flags are deprecated.
# Avoid logging too much and print the dependency tree for debugging.
RUN npm --quiet set progress=false \
 && npm install --omit=dev --omit=optional \
 && echo "Installed NPM packages:" \
 && (npm list || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Optionally, specify how to launch the source code of your actor.
# By default, Apify's base Docker images define the CMD instruction
# that runs the Node.js source code using the command specified
# in the "scripts.start" section of the package.json file.
# In short, the instruction looks something like this:
#
# CMD npm start
INPUT_SCHEMA.json
1{
2 "title": "Petition Crawling Input Schema",
3 "type": "object",
4 "schemaVersion": 1,
5 "properties": {
6 "petitionUrl": {
7 "title": "Petition URL (it should look like this https://www.petitions.lu/voir-les-signatures/xxxx-xxxx)",
8 "type": "string",
9 "nullable": false,
10 "description": "Please paste here the URL of the petition you want to crawl signatures from.",
11 "editor": "textfield"
12 }
13 }
14}
main.js
// This is the main Node.js source code file of your actor.
// It is referenced from the "scripts" section of the package.json file.
//
// The actor reads a petition URL from its input, builds an input object for
// the generic apify/web-scraper actor, and then metamorphs into that actor.

const Apify = require('apify');

Apify.main(async () => {
    // Get input of the actor. Input fields can be modified in INPUT_SCHEMA.json file.
    // For more information, see https://docs.apify.com/actor/input-schema
    const input = await Apify.getInput();
    console.log('Input:');
    console.dir(input);

    // Fail fast with a readable message: without a URL the start URL would be
    // undefined and the pseudo-URL below would begin with the text "undefined".
    const petitionUrl = typeof input?.petitionUrl === 'string' ? input.petitionUrl.trim() : '';
    if (petitionUrl === '') {
        throw new Error('Input field "petitionUrl" is required and must be a non-empty string.');
    }

    // Here you can prepare your input for actor apify/web-scraper. This input is based on the actor
    // task that was used as the starting point.
    const metamorphInput = {
        // NOTE(review): DEVELOPMENT run mode is kept from the original task; confirm
        // whether PRODUCTION is the intended mode for published runs.
        runMode: 'DEVELOPMENT',
        startUrls: [
            {
                url: petitionUrl,
                method: 'GET',
            },
        ],
        keepUrlFragments: false,
        linkSelector: 'a[href]',
        // Follow only the pagination links of this petition's signature list.
        pseudoUrls: [
            {
                purl: `${petitionUrl}?tx_petition_singlepetitionsignatures%5Bpagenumber%5D=[(\\d)+]&cHash=[(\\w)+]`,
                method: 'GET',
            },
        ],
        // The function accepts a single argument: the "context" object.
        // For a complete list of its properties and functions,
        // see https://apify.com/apify/web-scraper#page-function
        //
        // NOTE(review): this is passed as a function rather than a string, so it
        // presumably relies on the Apify client serializing functions in the
        // metamorph input — confirm. It must stay self-contained (no references
        // to variables outside its own body).
        pageFunction: async function pageFunction(context) {
            // This statement works as a breakpoint when you're trying to debug your code. Works only with Run mode: DEVELOPMENT!
            // debugger;

            // jQuery is handy for finding DOM elements and extracting data from them.
            // To use it, make sure to enable the "Inject jQuery" option.
            const $ = context.jQuery;
            const signatures = [];

            $('.petition-signatures div p').each((_index, element) => {
                const text = $(element).text().trim();
                if (text === '') {
                    // Nothing to parse in an empty paragraph; continue with the next one.
                    return;
                }

                // Each entry is comma separated and the city is the last part.
                // Everything before it belongs to the name, which may itself
                // contain commas, so those parts are re-joined with spaces.
                const parts = text.split(',').map((part) => part.trim());
                const city = parts.length > 1 ? parts.pop() : '';
                signatures.push({ name: parts.join(' '), city });
            });

            // Print some information to actor log. The count is logged because
            // interpolating the array itself only prints "[object Object]".
            context.log.info(`URL: ${context.request.url}, SIGNATURES: ${signatures.length}`);

            // Manually add a new page to the queue for scraping.
            // context.enqueueRequest({ url: 'http://www.example.com' });

            // Return the data extracted from the page.
            // It will be stored to the resulting dataset.
            return signatures;
        },
        preNavigationHooks: `// We need to return array of (possibly async) functions here.
// The functions accept two arguments: the "crawlingContext" object
// and "gotoOptions".
[
    async (crawlingContext, gotoOptions) => {
        // ...
    },
]`,
        postNavigationHooks: `// We need to return array of (possibly async) functions here.
// The functions accept a single argument: the "crawlingContext" object.
[
    async (crawlingContext) => {
        // ...
    },
]`,
        injectJQuery: true,
        injectUnderscore: false,
        proxyConfiguration: {
            useApifyProxy: false,
        },
        proxyRotation: 'RECOMMENDED',
        useChrome: false,
        useStealth: false,
        ignoreSslErrors: false,
        ignoreCorsAndCsp: false,
        downloadMedia: false,
        downloadCss: false,
        waitUntil: [
            'networkidle2',
        ],
        breakpointLocation: 'NONE',
        debugLog: false,
        browserLog: false,
    };

    // Now let's metamorph into actor apify/web-scraper using the created input.
    await Apify.metamorph('apify/web-scraper', metamorphInput);
});
package.json
1{
2 "name": "extract-signatures-chambre-depute-luxembourg",
3 "version": "0.0.1",
4 "dependencies": {
5 "apify": "^2.1.0"
6 },
7 "scripts": {
8 "start": "node main.js"
9 },
10 "author": "Thibault Milan"
11}
Developer
Maintained by Community
Actor Metrics
1 monthly user
-
1 star
Created in Nov 2021
Modified 2 years ago
Categories