forked from aavanesy/RParallelCompute
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathLecture7.R
More file actions
56 lines (46 loc) · 1.44 KB
/
Lecture7.R
File metadata and controls
56 lines (46 loc) · 1.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# 7.Coding Session: Real Example of Slow Web Scrapping
## Advanced Scrapping Function
library(dplyr); library(tidyr); library(tibble);
library(tidytext); library(rvest);
library(tictoc)
# Pay attention to memory!
# Scrape a web page and count its frequent bigrams.
#
# Reads the page at `url`, keeps only the text of <p> elements, tokenizes
# it into two-word (bigram) sequences, removes pairs containing stop words
# (tidytext's `stop_words`), and returns the surviving bigrams with their
# frequencies.
#
# @param url       Character scalar: the page to scrape.
# @param min_count Minimum number of occurrences a bigram needs to be kept.
#                  Defaults to 5, the original hard-coded cutoff, so existing
#                  callers are unaffected.
# @return A tibble with columns `bigram` (the joined word pair) and `n`
#         (its count), sorted most-frequent first.
get_ngramms <- function(url, min_count = 5) {
  read_html(url) %>%
    html_elements("p") %>%                     # paragraph nodes only
    html_text() %>%
    enframe() %>%                              # character vector -> tibble
    drop_na() %>%
    rename(line = 1, text = 2) %>%
    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
    filter(!is.na(bigram)) %>%                 # short paragraphs yield NA
    separate(bigram, c("word1", "word2"), sep = " ") %>%
    filter(!word1 %in% stop_words$word) %>%    # drop stop-word pairs
    filter(!word2 %in% stop_words$word) %>%
    count(word1, word2, sort = TRUE) %>%
    unite(bigram, word1, word2, sep = " ") %>%
    filter(n >= min_count)
}
# First example: scrape one article and time the whole pipeline with tictoc.
url <- "https://en.wikipedia.org/wiki/United_States"
tic()
usa_words <- get_ngramms(url)
toc()
# Show the resulting bigram counts.
usa_words %>% print()
# All countries, one by one?
# The first column of country_links.csv holds the article URLs.
all_countries <- pull(read.csv("country_links.csv"), 1)
## Function Breakdown -- Extra ----
# The same pipeline as get_ngramms(), executed step by step on the global
# `url` so each intermediate result can be inspected in the console.
read_html(url) %>%
# keep only the <p> (paragraph) nodes
html_elements("p") %>%
# extract their text content as a character vector
html_text() %>%
# turn the vector into a two-column tibble (name, value)
enframe() %>%
drop_na() %>%
# give the two columns meaningful names
rename(line = 1, text = 2) %>%
# tokenize each paragraph into two-word (bigram) tokens
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
# paragraphs too short to form a bigram produce NA tokens — drop them
filter(!is.na(bigram)) %>%
# split each bigram back into its two words for filtering
separate(bigram, c("word1", "word2"), sep = " ") %>%
# drop pairs where either word is a stop word (tidytext's stop_words)
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word) %>%
# frequency of each remaining word pair, most common first
count(word1, word2, sort = TRUE) %>%
# glue the pair back together into a single bigram column
unite(bigram, word1, word2, sep = " ") %>%
# keep only bigrams seen at least 5 times
filter(n >= 5)