-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathindex.js
More file actions
117 lines (103 loc) · 3.57 KB
/
index.js
File metadata and controls
117 lines (103 loc) · 3.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
var async = require('async');
var config = require('./config');
var Reddit = require('./lib/reddit');
var Scraper = require('./lib/scraper');
var Log = require('./lib/logger');
var _ = require('lodash');
// Module-level run state. `domains`, `subreddits` and `stats` are read by the
// scraping pipeline further down the file.
var domains = [];
var services = [];
var subreddits = config.subreddits;

// Running totals; printed by the final callback of the scraping pipeline.
var stats = {
  totalImagesDownloaded: 0,
  totalImagesExisting: 0,
  totalPagesScraped: 0,
  totalSubredditsScraped: 0
};

console.log();
console.log('-----------------------------------------------');
Log.info('Initializing ' + config.reddit.userAgent);

// Register each image host that is enabled in the config. The order of this
// list fixes the order of both `services` and `domains`.
['gfycat', 'imgur', 'redditmedia'].forEach(function (serviceName) {
  var serviceConfig = config[serviceName];
  if ( serviceConfig.enabled ) {
    services.push(serviceName);
    domains.push(serviceConfig.domains);
  }
});

// `domains` now holds one entry per enabled service (that service's
// `domains` value); flatten the entries into a single list.
domains = _.flatten(domains);

// Names of the configured subreddits, used only for the banner below.
var sr = subreddits.map(function (entry) {
  return entry.name;
});

Log.info(' Subreddits: ' + sr);
Log.info(' Services: ' + services.join(', '));
console.log('-----------------------------------------------');
// Main pipeline. For every configured subreddit: obtain its list of pages,
// scrape each page for images hosted on one of the enabled `domains`, download
// those images, then report per-subreddit counts. Once every subreddit has
// signalled completion, the final callback prints the aggregate `stats`.
async.each(subreddits, function (subreddit, subCallback) {
  Log.info('Started Processing /r/' + subreddit.name);

  /**
   * Receives the page listing for `subreddit` and drives scraping/downloading.
   *
   * @param {*} err - error reported while fetching the listing, if any.
   * @param {*} pages - expected to be an array of pages to scrape.
   */
  function scraperCallback(err, pages) {
    if (err) {
      // Fail fast, as before: a listing failure propagates as an exception
      // to whoever invoked this callback.
      throw err;
    }

    // Without a page array there is nothing to scrape. Signal completion so
    // async.each can still finish and print the final summary; previously this
    // path returned silently and the subreddit was never marked as done.
    if ( !Array.isArray(pages) ) {
      Log.warn('No page list received for /r/' + subreddit.name + '; skipping');
      subCallback();
      return;
    }

    var Images = [];
    var imagesDownloaded = 0;
    var existingImagesCount = 0;
    stats.totalSubredditsScraped++;

    // iterate over each page of the subreddit
    async.forEachOf(pages, function (page, index, callback) {
      // scrape the subreddit page
      Scraper.scrape(subreddit, page, { domains: domains }, function (err, images) {
        if (err) {
          // Report the failure instead of dropping it. The page is still
          // counted and the remaining pages carry on (best-effort, as before).
          Log.error('Error scraping a page of /r/' + subreddit.name + ': ' + err);
        }
        Images.push( _.flatten(images) );
        stats.totalPagesScraped++;
        callback();
      });
    }, function (err) {
      if (err) {
        Log.error('Error processing ' + subreddit.name + ': ' + err);
      }

      // One entry was pushed per page; merge them into a single image list.
      Images = _.flatten(Images);

      async.each(Images, function (image, cb) {
        Scraper.download(image.url, image.imgPath, function (err, wasDownloaded) {
          if (err) {
            // Report the failure instead of dropping it.
            // NOTE(review): a failed download with a falsy `wasDownloaded` is
            // still tallied as "already existing" below, as in the original --
            // confirm against Scraper.download's contract.
            Log.warn('Error downloading ' + image.url + ': ' + err);
          }
          if (wasDownloaded) {
            imagesDownloaded++;
          } else {
            existingImagesCount++;
          }
          // Wait 3s before marking this image as done.
          // NOTE(review): async.each starts all iterations at once, so this
          // delays completion but does not space out the downloads; if
          // throttling was intended, eachSeries/eachLimit may have been meant.
          setTimeout(function () {
            cb(null);
          }, 3000);
        });
      }, function (err) {
        if (err) {
          Log.warn(err);
        }
        Log.info('Finished with /r/' + subreddit.name);
        Log.info(' Downloaded ' + imagesDownloaded, 'images.');
        Log.info(' ' + existingImagesCount, 'images already existing.');
        stats.totalImagesDownloaded += imagesDownloaded;
        stats.totalImagesExisting += existingImagesCount;
        subCallback(err);
      });
    });
  }

  // stub out mock data for dev/test mode
  // NOTE(review): `pages` is not declared in this scope anywhere in the visible
  // file, so this branch throws a ReferenceError unless a global `pages` is
  // provided by the environment -- confirm where the mock data should come from.
  if ( process.env.NODE_ENV === 'development' || process.env.NODE_ENV === 'test') {
    scraperCallback(null, pages);
  } else {
    Reddit.getData(config.reddit.url, subreddit, scraperCallback);
  }
}, function () {
  // Runs once every subreddit has called subCallback (or one reported an error).
  console.log();
  console.log('--------------------------');
  Log.info('Scraper run finished');
  Log.info(' Total Images Downloaded:', stats.totalImagesDownloaded);
  Log.info(' Total Pre-existing Images:', stats.totalImagesExisting);
  Log.info(' Total Subreddits Scraped:', stats.totalSubredditsScraped);
  Log.info(' Total Pages Scraped:', stats.totalPagesScraped);
  console.log('--------------------------');
});