1
2import { Actor } from 'apify';
3
4import axios from 'axios';
5
6import * as cheerio from 'cheerio';
7
8import https from 'https';
9
10
await Actor.init();

// Actor input. The run may have no input record at all, in which case
// getInput() does not yield an object; fall back to {} so the destructuring
// below applies its defaults instead of throwing a TypeError.
const input = (await Actor.getInput()) ?? {};
const {
    url,
    extractPosts = true,
    extractPages = true,
    extractMedia = true,
    extractMetadata = true,
    maxPages = 0,
    includeComments = false,
    postContainerSelector = 'article, .post, .entry, .type-post',
    titleSelector = 'h1, .entry-title, .post-title, .wp-block-post-title',
    contentSelector = '.entry-content, .post-content, .content, .wp-block-post-content'
} = input;

// Without a start URL nothing can be crawled; fail the run with a clear
// message instead of finishing "successfully" with zero results.
if (typeof url !== 'string' || url.trim() === '') {
    await Actor.fail('Input field "url" is required and must be a non-empty string.');
}

// CSS selectors used by the extraction code, overridable through the input.
const selectors = {
    postContainer: postContainerSelector,
    titleSelector: titleSelector,
    contentSelector: contentSelector
};

console.log('Starting WordPress content extraction from:', url);

// Shared axios request options: browser-like headers and a 30 s timeout.
const axiosConfig = {
    headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
    },
    timeout: 30000,
    // NOTE(security): rejectUnauthorized: false disables TLS certificate
    // verification for every HTTPS request made with this config. It lets the
    // scraper read sites with broken certificates, but also accepts forged
    // ones. Kept as-is to preserve behavior; review whether it is required.
    httpsAgent: new https.Agent({
        rejectUnauthorized: false
    })
};
52
53
/**
 * Downloads one page and extracts its title, content, excerpt, metadata,
 * media, comments, date, author, categories and tags.
 *
 * Reads the module-level `selectors`, `axiosConfig`, `extractMetadata`,
 * `extractMedia` and `includeComments` settings.
 *
 * @param {string} pageUrl - Absolute URL of the page to fetch.
 * @returns {Promise<object|null>} The extracted page record, or `null` when
 *   the request or the parsing failed (the error is logged, not thrown).
 */
async function extractPageContent(pageUrl) {
    console.log('Extracting content from:', pageUrl);

    try {
        const response = await axios.get(pageUrl, axiosConfig);
        const $ = cheerio.load(response.data);

        // Trimmed text of the first match, or '' when nothing matches.
        const firstText = (selector) => $(selector).first().text().trim();

        // Trimmed, non-empty, de-duplicated texts of all matches, in document order.
        const uniqueTexts = (selector) => {
            const texts = $(selector)
                .map((i, element) => $(element).text().trim())
                .get()
                .filter(Boolean);
            return [...new Set(texts)];
        };

        const pageData = {
            url: pageUrl,
            title: firstText(selectors.titleSelector),
            content: '',
            excerpt: firstText('.entry-summary, .post-excerpt, .excerpt'),
            metadata: {},
            media: [],
            comments: [],
            publishedDate: '',
            author: firstText('.author, .by-author, .entry-author'),
            categories: [],
            tags: [],
            type: 'page'
        };

        // Prefer the inner HTML of the content container; fall back to its text.
        const contentElement = $(selectors.contentSelector).first();
        if (contentElement.length) {
            pageData.content = contentElement.html() || contentElement.text().trim();
        }

        if (extractMetadata) {
            const metaContent = (selector) => $(selector).attr('content') || '';
            pageData.metadata = {
                description: metaContent('meta[name="description"]'),
                keywords: metaContent('meta[name="keywords"]'),
                ogTitle: metaContent('meta[property="og:title"]'),
                ogDescription: metaContent('meta[property="og:description"]'),
                ogImage: metaContent('meta[property="og:image"]'),
                twitterTitle: metaContent('meta[name="twitter:title"]'),
                twitterDescription: metaContent('meta[name="twitter:description"]'),
                canonical: $('link[rel="canonical"]').attr('href') || ''
            };
        }

        if (extractMedia) {
            const images = [];
            $('img').each((i, img) => {
                const src = $(img).attr('src');
                const alt = $(img).attr('alt') || '';
                if (src) {
                    images.push({ src, alt, type: 'image' });
                }
            });

            const videos = [];
            $('video, iframe[src*="youtube"], iframe[src*="vimeo"]').each((i, video) => {
                const src = $(video).attr('src');
                if (src) {
                    videos.push({ src, type: 'video' });
                }
            });

            pageData.media = [...images, ...videos];
        }

        if (includeComments) {
            $('.comment, .wp-comment').each((i, comment) => {
                const commentNode = $(comment);
                const commentData = {
                    author: commentNode.find('.comment-author, .author').text().trim(),
                    content: commentNode.find('.comment-content, .content').text().trim(),
                    date: commentNode.find('.comment-date, .date').text().trim()
                };
                // Comments without any body text are skipped.
                if (commentData.content) {
                    pageData.comments.push(commentData);
                }
            });
        }

        // A machine-readable datetime attribute wins over the visible date text.
        const dateElement = $('.entry-date, .post-date, .published, time[datetime]').first();
        if (dateElement.length) {
            pageData.publishedDate = dateElement.attr('datetime') || dateElement.text().trim();
        }

        // Select the links inside the .cat-links / .tag-links wrappers. Matching
        // the wrapper itself would store its whole text (label plus every term)
        // as a single entry.
        pageData.categories = uniqueTexts('.cat-links a, .categories a, .entry-categories a');
        pageData.tags = uniqueTexts('.tag-links a, .tags a, .entry-tags a');

        // Classify as a post based on the body classes or the URL path.
        const body = $('body');
        if (body.hasClass('single-post') || body.hasClass('blog') || pageUrl.includes('/blog/') || pageUrl.includes('/post/')) {
            pageData.type = 'post';
        }

        return pageData;
    } catch (error) {
        console.error('Error extracting content from', pageUrl, ':', error.message);
        return null;
    }
}
184
185
/**
 * Builds the list of URLs to scrape for a WordPress site.
 *
 * The list starts with `baseUrl` itself, followed by same-site links found in
 * the navigation/pagination/archive markup of that page, followed by the
 * `link` values returned by the public WordPress REST API (posts, pages,
 * categories, tags) when those endpoints answer with an array.
 *
 * Navigation links are collected until the list holds `maxPages` entries
 * (100 when `maxPages` is 0); REST API links are not capped here.
 *
 * @param {string} baseUrl - Start URL of the site.
 * @returns {Promise<string[]>} Unique URLs in discovery order. Errors are
 *   logged and whatever was collected so far is returned.
 */
async function discoverWordPressContent(baseUrl) {
    console.log('Discovering WordPress content...');

    if (!baseUrl) {
        console.error('Error discovering content:', 'no base URL provided');
        return [];
    }

    const discoveryLimit = maxPages || 100;

    // Key used for de-duplication: fragments and trailing slashes do not
    // change which page is served, so they must not create extra entries.
    const toKey = (pageUrl) => pageUrl.replace(/#.*$/, '').replace(/\/+$/, '');

    // Insertion-ordered map of de-duplication key -> URL to scrape. Seeded
    // with the start page so that it is always part of the result.
    const discoveredUrls = new Map([[toKey(baseUrl), baseUrl]]);

    const remember = (pageUrl) => {
        const key = toKey(pageUrl);
        if (!discoveredUrls.has(key)) {
            discoveredUrls.set(key, pageUrl);
        }
    };

    // Resolves an href (possibly relative) against the start URL and returns
    // it without its fragment, or null when it is invalid or off-site.
    const toSiteUrl = (href) => {
        if (!href) {
            return null;
        }
        try {
            const resolved = new URL(href, baseUrl);
            resolved.hash = '';
            return resolved.href.startsWith(baseUrl) ? resolved.href : null;
        } catch {
            return null;
        }
    };

    try {
        const response = await axios.get(baseUrl, axiosConfig);
        const $ = cheerio.load(response.data);

        const navigationSelectors = [
            'nav a', '.main-navigation a', '.menu a', '.wp-block-navigation a',
            '.pagination a', '.page-numbers a', '.nav-links a',
            'a[href*="/page/"]', 'a[href*="/category/"]', 'a[href*="/tag/"]',
            'a[href*="/author/"]', 'a[href*="/blog/"]', 'a[href*="/posts/"]'
        ];

        for (const selector of navigationSelectors) {
            for (const element of $(selector).toArray()) {
                if (discoveredUrls.size >= discoveryLimit) {
                    break;
                }
                const siteUrl = toSiteUrl($(element).attr('href'));
                if (siteUrl) {
                    remember(siteUrl);
                }
            }
        }

        // Strip trailing slashes so the endpoints never contain "//wp-json".
        const apiRoot = baseUrl.replace(/\/+$/, '');
        const wpApiUrls = [
            `${apiRoot}/wp-json/wp/v2/posts`,
            `${apiRoot}/wp-json/wp/v2/pages`,
            `${apiRoot}/wp-json/wp/v2/categories`,
            `${apiRoot}/wp-json/wp/v2/tags`
        ];

        for (const apiUrl of wpApiUrls) {
            try {
                // Same headers/agent as page requests, with a shorter timeout.
                const apiResponse = await axios.get(apiUrl, { ...axiosConfig, timeout: 10000 });
                if (Array.isArray(apiResponse.data)) {
                    for (const item of apiResponse.data) {
                        if (item && typeof item.link === 'string' && item.link) {
                            remember(item.link);
                        }
                    }
                }
            } catch (apiError) {
                // The REST API is optional (disabled or blocked on many sites):
                // report it and continue with the remaining endpoints.
                console.log(`REST API endpoint unavailable, skipping ${apiUrl}: ${apiError.message}`);
            }
        }
    } catch (error) {
        console.error('Error discovering content:', error.message);
    }

    return Array.from(discoveredUrls.values());
}
246
247
// Main run: discover URLs, scrape them one at a time, and store each result.
try {
    const allUrls = await discoverWordPressContent(url);
    console.log(`Found ${allUrls.length} URLs to process`);

    let processedCount = 0;
    let extractedCount = 0;

    for (const pageUrl of allUrls) {
        // maxPages === 0 means "no limit".
        if (maxPages > 0 && processedCount >= maxPages) {
            console.log(`Reached maximum pages limit: ${maxPages}`);
            break;
        }

        // Politeness delay between consecutive requests; none is needed
        // before the first page or after the last one.
        if (processedCount > 0) {
            await new Promise((resolve) => setTimeout(resolve, 1000));
        }

        const content = await extractPageContent(pageUrl);
        processedCount++;

        // Pages that failed to load or have no title are skipped.
        if (content && content.title) {
            // Store each record as soon as it is extracted so that a timeout
            // or abort later in the run does not discard earlier results.
            await Actor.pushData(content);
            extractedCount++;
            console.log(`Extracted: ${content.title} (${content.type})`);
        }
    }

    console.log(`Successfully extracted ${extractedCount} pages/posts`);
} catch (error) {
    console.error('Error during extraction:', error);
    // Actor.fail() takes a status message (or an options object), not an
    // Error instance; pass the message so the failure reason is recorded.
    await Actor.fail(`WordPress extraction failed: ${error?.message ?? String(error)}`);
}

await Actor.exit();