-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscript.js
More file actions
43 lines (34 loc) · 1.58 KB
/
script.js
File metadata and controls
43 lines (34 loc) · 1.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
const { chromium } = require('playwright');
/**
 * Script entry point: opens a public ORCID record in headless Chromium,
 * collects every listed work that has both a title and a DOI link, and
 * prints the result to stdout as pretty-printed JSON
 * (`[{ "title": string, "doi": string }, ...]`).
 *
 * Diagnostics go to stderr. The process exit code is set to 1 when the
 * scrape throws or yields no works, so shell/CI callers can detect failure
 * instead of silently consuming empty output.
 */
(async () => {
  const ORCID_URL = 'https://orcid.org/0009-0004-4273-3365';
  // Applied to both navigation and the works-container wait.
  const TIMEOUT_MS = 30000;
  // Fixed grace period for the list items to render after the container appears.
  const SETTLE_DELAY_MS = 2000;

  // Declared outside the try so the finally block can close it, while launch
  // failures are still reported by the catch below.
  let browser;
  try {
    browser = await chromium.launch();
    const page = await browser.newPage();

    // 'domcontentloaded' resolves earlier than the default 'load' event.
    await page.goto(ORCID_URL, { waitUntil: 'domcontentloaded', timeout: TIMEOUT_MS });

    // Wait for the container of the works list before querying inside it.
    await page.waitForSelector('#works-container', { timeout: TIMEOUT_MS });

    // NOTE(review): a fixed delay is inherently racy; kept because the item
    // selector below has not been verified against the live page.
    await page.waitForTimeout(SETTLE_DELAY_MS);

    // The callback is serialized and executed in the browser context, so it
    // must be self-contained (no references to variables from this scope).
    const works = await page.$$eval('#works-container cy-list-item', (nodes) => {
      // Strip the resolver prefix; covers http/https and the legacy dx.doi.org host,
      // all of which the a[href*="doi.org"] selector can match.
      const doiPrefix = /https?:\/\/(?:dx\.)?doi\.org\//i;
      return nodes
        .map((node) => {
          const titleElement = node.querySelector('cy-list-item-title a');
          const doiElement = node.querySelector('a[href*="doi.org"]');
          const title = titleElement ? titleElement.innerText.trim() : null;
          const doi = doiElement ? doiElement.href.replace(doiPrefix, '') : null;
          return { title, doi };
        })
        // Keep only entries that have both a non-empty title and a DOI.
        .filter((work) => work.title && work.doi);
    });

    if (works.length === 0) {
      // For debugging, dump `await page.content()` here to inspect the markup.
      console.error('No works with DOIs were found. The page structure might have changed.');
      process.exitCode = 1;
    } else {
      console.log(JSON.stringify(works, null, 2));
    }
  } catch (error) {
    console.error('Error scraping ORCID page:', error);
    process.exitCode = 1;
  } finally {
    // browser is undefined when launch itself failed.
    if (browser) {
      await browser.close();
    }
  }
})();