//comment_scrape.js
/*
script that scrapes the comments from a youtube video
*/
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
const { spawn, exec } = require('child_process');
const fs = require('fs');
const fsp = require('fs').promises;
const { executablePath } = require('puppeteer')
puppeteer.use(StealthPlugin());
// Promise-based delay: resolves (with no value) after `ms` milliseconds.
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Scrapes the comments of a YouTube video by driving a headed Chromium
 * instance through a local proxy on :8080 (which is what actually captures
 * the youtubei API traffic — certificate errors are ignored because the
 * proxy re-signs TLS).
 *
 * Usage: node comment_scrape.js <youtube-video-url>
 *
 * Side effects:
 *  - creates the empty output files if they do not yet exist
 *  - launches a browser, scrolls to lazy-load comments, expands "show more"
 *  - exits the process: 0 on success, 1 on any failure
 */
async function run() {
  // Ensure the output files exist before scraping starts. The original code
  // used async fire-and-forget fs.writeFile here, so there was no guarantee
  // the files existed by the time downstream consumers looked for them.
  const outputFiles = [
    'transcript_scrape/comments.csv',
    'output/com_youtubei_v1_next/data.json',
  ];
  for (const file of outputFiles) {
    if (!fs.existsSync(file)) {
      fs.writeFileSync(file, '');
    }
  }

  // Validate the CLI argument before paying the cost of a browser launch
  // (the original launched first and leaked the browser on this exit path).
  const targetUrl = process.argv[2];
  if (!targetUrl) {
    console.error('Please provide a YouTube video URL as an argument');
    process.exit(1);
  }

  let browser;
  let exitCode = 0;
  try {
    browser = await puppeteer.launch({
      headless: false,
      args: [
        '--proxy-server=http://localhost:8080',
        '--no-sandbox',
        '--ignore-certificate-errors',
        '--ignore-certificate-errors-spki-list',
      ],
      ignoreHTTPSErrors: true,
    });
    const page = await browser.newPage();

    console.log(`Visiting ${targetUrl}`);
    await page.goto(targetUrl, { waitUntil: 'networkidle2' });

    // Scroll `count` screens down, pausing between scrolls so YouTube's
    // lazy loader has time to fetch the next batch of comments.
    const scrollDown = async (count) => {
      for (let i = 0; i < count; i++) {
        await page.evaluate(() => {
          window.scrollBy(0, window.innerHeight);
        });
        await sleep(3000);
      }
    };

    // Click every "show more"-style button so truncated comment bodies are
    // fully expanded (and therefore present in the captured responses).
    const clickAllButtons = async () => {
      const buttons = await page.$$('button.yt-spec-button-shape-next.yt-spec-button-shape-next--text.yt-spec-button-shape-next--call-to-action.yt-spec-button-shape-next--size-m.yt-spec-button-shape-next--icon-leading.yt-spec-button-shape-next--align-by-text');
      for (const button of buttons) {
        console.log("Clicking button");
        await button.click();
      }
    };

    await scrollDown(15);    // load a deep batch of comments
    await clickAllButtons(); // expand truncated ones
    await scrollDown(2);     // trigger any final lazy loads

    console.log("Comment scraping completed successfully.");
  } catch (error) {
    console.error(error);
    exitCode = 1;
  } finally {
    // Guarantee the browser is closed on BOTH paths — the original leaked
    // the Chromium process whenever an error was thrown before close().
    if (browser) {
      await browser.close();
    }
  }
  // Single exit point AFTER cleanup; the original called process.exit()
  // inside the try block, which would have skipped any cleanup.
  process.exit(exitCode);
}
// Entry point: kick off the scrape. This catch is a last-resort handler for
// any rejection that escapes run(); it logs the error and exits non-zero.
// NOTE(review): this handler does NOT close the browser — if run() rejects
// after launching Chromium, that process is leaked. Also, only a single URL
// (process.argv[2]) is handled per invocation, which limits throughput.
run().catch((error) => {
console.error(error);
process.exit(1);
});