Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions clean.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// Scratch script for experimenting with the regexes that strip the per-page
// PDF footer ("© <year> Software Engineering Daily<page>") from a transcript.
const script = "ry MongoDB today with Atlas, the global cloud database service that runs on AWS, Azure and Google Cloud. Configure, deploy and connect to your database in just a few minutes. Check it out at mongodb.com/atlas. That's mongodb.com/atlas. Thank you to MongoDB for being a sponsor of Software Engineering Daily.[END] © 2019 Software Engineering Daily"

// Earlier experiments, kept for reference; none of these are applied below.
const script2 = '© 2019 Software Engineering Daily'
const re = /©(.*?)Transcript/
const clean_re = /©(.*?)Software Engineering Daily\d$/
const page_re = /SED \d{3}/

// The page number after the footer is optional: the sample above ends right
// after "Daily", so a mandatory trailing \d would never match it and the
// footer would be left in the output.
const cr_re = /© \d{4} Software Engineering Daily\d?/gi

const cleaned = script.replace(cr_re, '')
console.log(cleaned)
157 changes: 157 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
"axios": "^0.18.0",
"bluebird": "^3.5.1",
"colors": "^1.3.0",
"crawler-request": "^1.2.2",
"dom-parser": "^0.1.5",
"dotenv": "^4.0.0",
"generate-password": "^1.4.0",
Expand Down
2 changes: 2 additions & 0 deletions run-all.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ node makeThreadForEpisodes.js
# node links-add-images.js # hangs up, do we even need?
echo 'Getting transcript URLs'
node getTranscriptURL.js
echo 'Scrape transcript into markdown'
node scrapeTranscript.js
echo 'Getting guest images'
node getGuestImage.js
echo 'Getting tags from Wordpress'
Expand Down
100 changes: 100 additions & 0 deletions scrapeTranscript.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
/*
Script to crawl the transcriptURL of each post and save the cleaned-up
transcript text to that post in the database.
*/

require('dotenv').config()

const crawler = require('crawler-request')
const db = require('monk')(process.env.MONGO_DB)
const posts = db.get('posts')
const Bluebird = require('bluebird')
const rp = require('request-promise')
const async = require('async')

/**
 * Turn the raw text extracted from a transcript into lightly marked-up HTML.
 *
 * Steps, in order:
 *  1. Remove every line break (lines are joined with no separator).
 *  2. Strip page furniture: "SED nnn" headers, the copyright footer with its
 *     optional page number, and the word "Transcript".
 *  3. Wrap each timestamped segment in <p>, and the segments that follow an
 *     [INTRODUCTION] / [SPONSOR MESSAGE] marker in a classed <span>.
 *  4. Wrap "[hh:mm:ss] Speaker:" headers and the bare timestamps in classed
 *     <span>s.
 *  5. Drop "EPISODE n" and the remaining bracketed section markers.
 *
 * @param {string} script - raw transcript text.
 * @returns {string} the cleaned, marked-up transcript.
 */
const cleanScript = (script) => {
  // Footer: "© 2019 Software Engineering Daily" followed by an optional 1-2
  // digit page number. The number is matched only when it directly follows
  // the footer (modulo whitespace); a lazy "anything up to the next digit"
  // would swallow real transcript text whenever the page number is missing.
  const cr_re = /© \d{4} Software Engineering Daily\s*\d{0,2}/g
  const page_re = /SED \d{3}/g
  const transcript_re = /Transcript/g
  const end_re = /\[END\]/g
  const nl_re = /\r?\n|\r/g
  const episode_re = /EPISODE \d{1,3}/g

  // A timestamp such as [0:12:34] or [00:12:34.5] plus everything up to the
  // next "[" (i.e. the next marker or timestamp).
  const talk_segment_re = /\[\d{1,2}\:\d{2}\:\d{2}(?:\.\d)?\](.*?)[^\[]*/g
  const intro_re = /\[INTRODUCTION\](.*?)\[\d{1,2}\:\d{2}\:\d{2}(?:\.\d)?\](.*?)[^\[]*/g
  const sponsor_re = /\[SPONSOR MESSAGE\](.*?)\[\d{1,2}\:\d{2}\:\d{2}(?:\.\d)?\](.*?)[^\[]*/g
  const time_re = /\[\d{1,2}\:\d{2}\:\d{2}(?:\.\d)?\]/g
  // Timestamp through the first following colon, e.g. "[0:01:02.3] JM:".
  const talk_header_re = /\[\d{1,2}\:\d{2}\:\d{2}(?:\.\d)?\](.*?)[\:]/ig

  return script
    .replace(nl_re, '')
    .replace(page_re, '')
    .replace(cr_re, '')
    .replace(transcript_re, '')
    // Order matters from here on: segments are wrapped first, then the
    // intro/sponsor wrappers match across the freshly inserted <p> tags, and
    // the header is wrapped before the timestamp nested inside it.
    .replace(talk_segment_re, '<p>$&</p>')
    .replace(intro_re, '<span class="transcript-intro">$&</span>')
    .replace(sponsor_re, '<span class="transcript-sponsor">$&</span>')
    .replace(talk_header_re, '<span class="transcript-header">$&</span>')
    .replace(time_re, '<span class="transcript-time">$&</span>')
    .replace(episode_re, '')
    .replace(end_re, '')
    .replace(/\[INTRODUCTION\]/g, '')
    .replace(/\[INTERVIEW\]/g, '')
    .replace(/\[SPONSOR MESSAGE\]/g, '')
    .replace(/\[INTERVIEW CONTINUED\]/g, '')
    .replace(/\[END OF INTERVIEW\]/g, '')
}

// True while the posts query at the bottom of this file is still pushing
// documents onto the queue; set to false once that enumeration resolves.
// q.drain checks it so the DB is not closed on an early, temporary drain.
var processingPosts = true
// Number of queue workers, passed as the concurrency argument to async.queue.
const CONCURRENCY = 5

// Work queue: for each post, download its transcript, clean it and store the
// result on the post document under "transcript".
//
// The task callback is invoked exactly once per post, with the error when the
// crawl or the DB update fails. Without that, a single failed request would
// leave the worker slot occupied forever, the queue would never drain and the
// process would never close the DB connection.
var q = async.queue(function(post, callback) {
  const { transcriptURL } = post
  if (!transcriptURL) {
    // Nothing to crawl for this post.
    callback()
    return
  }

  crawler(transcriptURL)
    .then(function(response) {
      if (!response.text) {
        // No extractable text; leave the post untouched.
        return undefined
      }
      const cleaned = cleanScript(response.text)
      return posts.update({id: post.id}, {
        $set: {
          "transcript": cleaned
        },
      })
    })
    .then(
      function() { callback() },
      function(err) { callback(err) }
    )
}, CONCURRENCY)

// Called when the queue runs empty. Workers can outpace the DB cursor that
// feeds the queue, so an empty queue only means "done" once enumeration of
// the posts has finished as well.
q.drain = () => {
  const stillEnumerating = processingPosts
  if (stillEnumerating) {
    return
  }
  console.log('all items have been processed')
  db.close()
}


// Marks enumeration as finished. q.drain only fires at the moment the last
// running task completes, so if the queue is already idle here (every task
// finished before the cursor did, or the query matched no posts at all) no
// further drain event will come and the connection has to be closed now.
const finishEnumeration = () => {
  processingPosts = false
  if (q.idle()) {
    console.log('all items have been processed')
    db.close()
  }
}

// Stream every post that has a transcriptURL into the work queue.
posts.find({transcriptURL: {$exists: true}})
  .each((post) => {
    q.push(post, function (err) {
      if (err) {
        console.log(err)
      } else {
        console.log('finished processing', post.id)
      }
    })
  })
  .then(finishEnumeration, (err) => {
    // The query itself failed: report it, then let already-queued work
    // finish (or close immediately if there is none).
    console.log(err)
    finishEnumeration()
  })