Skip to content

Commit 0e19fb9

Browse files
committed
News Crawler day #8
1 parent 90fed9e commit 0e19fb9

8 files changed

Lines changed: 276 additions & 15 deletions

File tree

BerlinAIT/NewsCrawler/src/main/java/news/crawler/common/DateTimeUtils.java

Lines changed: 34 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,36 +9,57 @@
99
@Slf4j
1010
public class DateTimeUtils {
1111

12-
private static String[] monthsRu = {"января","февраля","марта","апреля","мая","июня",
13-
"июля","августа","сентября","октября","ноября","декабря"};
12+
private final static String[] MONTHS_RU = {"января","февраля","марта","апреля","мая","июня","июля","августа",
13+
"сентября","октября","ноября","декабря"};
14+
private final static String[] MONTHS_CZ = {"ledna","února","března","dubna","května","června","července","srpna",
15+
"září","října","listopadu","prosince"};
1416

15-
public static LocalDateTime convertDateTime(String dateTime) {
16-
String[] dt = dateTime.split("[ ,]+");
17-
if (dt[0].length() == 1) {
18-
dt[0] = "0" + dt[0];
17+
/**
18+
* Convert date-time in String to LocalDateTime
19+
*
20+
* @param strDateTime
21+
* @return LocalDateTime
22+
*/
23+
public static LocalDateTime convertDateTime(String strDateTime) {
24+
LocalDateTime dateTime = null;
25+
String[] dt = strDateTime.split("[ ,\\.]+");
26+
if (dt.length == 5) {
27+
dt = new String[]{dt[1], dt[2], dt[3], dt[4]};
1928
}
2029

21-
int idx = Arrays.asList(monthsRu).indexOf(dt[1].toLowerCase());
22-
dt[1] = String.valueOf(idx + 1);
23-
if (dt[1].length() == 1) {
24-
dt[1] = "0" + dt[1];
30+
int idx = Arrays.asList(MONTHS_RU).indexOf(dt[1].toLowerCase());
31+
if (idx < 0) {
32+
idx = Arrays.asList(MONTHS_CZ).indexOf(dt[1].toLowerCase());
2533
}
34+
dt[1] = String.valueOf(idx + 1);
2635

27-
return LocalDateTime.parse(String.join(" ", dt), DateTimeFormatter.ofPattern("dd MM yyyy HH:mm"));
36+
try {
37+
dateTime = LocalDateTime.parse(String.join(" ", dt), DateTimeFormatter.ofPattern("d M yyyy HH:mm"));
38+
} catch (Exception e) {
39+
log.error(e.getMessage());
40+
}
41+
return dateTime;
2842
}
2943

44+
/**
45+
* Convert date and time in String to LocalDateTime
46+
*
47+
* @param date
48+
* @param time
49+
* @return LocalDateTime
50+
*/
3051
public static LocalDateTime convertDateTime(String date, String time) {
3152
LocalDateTime dateTime = null;
3253
try {
33-
dateTime = LocalDateTime.parse(date + " " + time, DateTimeFormatter.ofPattern("dd.MM.yyyy HH:mm"));
54+
dateTime = LocalDateTime.parse(date + " " + time, DateTimeFormatter.ofPattern("d.M.yyyy HH:mm"));
3455
} catch (Exception e) {
3556
log.error(e.getMessage());
3657
}
3758
return dateTime;
3859
}
3960

4061
public static void main(String[] args) {
41-
String dt = "7 Сентября, 2023 15:39";
62+
String dt = "вторник, 17. listopadu, 2023 15:39";
4263
System.out.println(convertDateTime(dt));
4364
}
4465
}

BerlinAIT/NewsCrawler/src/main/java/news/crawler/service/executor/CrawlerExecutor.java

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@
33
import lombok.extern.slf4j.Slf4j;
44
import news.crawler.controller.dto.EventDTO;
55
import news.crawler.domain.SourceConfig;
6-
import news.crawler.repository.EventRepository;
76
import news.crawler.repository.SourceConfigRepository;
87
import news.crawler.service.EventService;
98
import org.springframework.beans.factory.annotation.Autowired;
9+
import org.springframework.beans.factory.annotation.Value;
1010
import org.springframework.context.SmartLifecycle;
1111
import org.springframework.stereotype.Service;
1212

@@ -19,6 +19,12 @@ public class CrawlerExecutor implements SmartLifecycle {
1919

2020
//static final Logger log = LoggerFactory.getLogger(CrawlerExecutor.class);
2121

22+
@Value("${executor.enabled:true}")
23+
private boolean serviceEnabled;
24+
25+
@Value("${executor.waitMin:60}")
26+
private int waitMin;
27+
2228
@Autowired
2329
private SourceConfigRepository sourceConfigRepository;
2430

@@ -56,7 +62,7 @@ public void run() {
5662
}
5763

5864
try {
59-
lock.wait(1000 * 60 * 60);
65+
lock.wait(1000 * 60 * waitMin);
6066
} catch (InterruptedException e) {
6167
log.error(e.getMessage());
6268
break;
@@ -70,6 +76,11 @@ public void run() {
7076
@Override
7177
public void start() {
7278
log.info("Service starting...");
79+
if (!serviceEnabled) {
80+
log.info("Service is disabled.");
81+
status = ThreadStatus.STOPPED;
82+
return;
83+
}
7384
status = ThreadStatus.RUNNING;
7485
new Thread(() -> {
7586
run();
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
package news.crawler.service.executor;
2+
3+
import lombok.extern.slf4j.Slf4j;
4+
import news.crawler.common.DateTimeUtils;
5+
import news.crawler.controller.dto.EventDTO;
6+
import news.crawler.domain.SourceConfig;
7+
import org.jsoup.Jsoup;
8+
import org.jsoup.nodes.Document;
9+
import org.jsoup.nodes.Element;
10+
import org.jsoup.select.Elements;
11+
12+
import java.io.IOException;
13+
import java.time.LocalDateTime;
14+
import java.util.ArrayList;
15+
import java.util.List;
16+
17+
@Slf4j
18+
public class ExecEKladensko implements Execute {
19+
@Override
20+
public List<EventDTO> execute(SourceConfig config) {
21+
List<EventDTO> events = new ArrayList<>();
22+
23+
try {
24+
Document root = Jsoup.connect(config.getRootUrl() + config.getNewsSuffix()).get();
25+
Elements links = root.getElementsByClass("latestItemTitle");
26+
for (Element element : links) {
27+
Element href = element.select("a").first();
28+
String title = href.text();
29+
String newsUrl = config.getRootUrl() + href.attr("href");
30+
31+
// read news page and extract date-time and text
32+
if (newsUrl.contains(config.getNewsSuffix())) {
33+
log.info("Reading news from {}...", newsUrl);
34+
35+
Document news = Jsoup.connect(newsUrl).get();
36+
37+
LocalDateTime dateTime = null;
38+
Element elementDateTime = news.getElementsByClass("itemHits").first();
39+
if (elementDateTime != null) {
40+
String[] newsDate = elementDateTime.text().split(" ");
41+
dateTime = DateTimeUtils.convertDateTime(newsDate[0], newsDate[1]);
42+
}
43+
44+
String imgUrl = null;
45+
Element imgElement = news.getElementsByClass("itemImage").first();
46+
if (imgElement != null) {
47+
Element imageHref = imgElement.select("img").first();
48+
imgUrl = config.getRootUrl() + imageHref.attr("src");
49+
}
50+
String text = null;
51+
Element introtext = news.getElementsByClass("introtext").first();
52+
if (introtext != null) {
53+
text = introtext.text();
54+
}
55+
Element fulltext = news.getElementsByClass("fulltext").first();
56+
if (fulltext != null) {
57+
Elements paragraphs = fulltext.select("p");
58+
for (Element p : paragraphs) {
59+
if (!p.text().isEmpty()) {
60+
text = text + "\n" + p.text();
61+
}
62+
}
63+
}
64+
65+
events.add(new EventDTO(title, newsUrl, dateTime, text, imgUrl));
66+
}
67+
}
68+
} catch (IOException e) {
69+
log.error(e.getMessage());
70+
}
71+
72+
return events;
73+
}
74+
}
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
package news.crawler.service.executor;
2+
3+
import lombok.extern.slf4j.Slf4j;
4+
import news.crawler.common.DateTimeUtils;
5+
import news.crawler.controller.dto.EventDTO;
6+
import news.crawler.domain.SourceConfig;
7+
import org.jsoup.Jsoup;
8+
import org.jsoup.nodes.Document;
9+
import org.jsoup.nodes.Element;
10+
import org.jsoup.select.Elements;
11+
12+
import java.io.IOException;
13+
import java.time.LocalDateTime;
14+
import java.util.ArrayList;
15+
import java.util.List;
16+
17+
@Slf4j
18+
public class ExecKladenskelisty implements Execute {
19+
@Override
20+
public List<EventDTO> execute(SourceConfig config) {
21+
List<EventDTO> events = new ArrayList<>();
22+
23+
try {
24+
Document root = Jsoup.connect(config.getRootUrl()).get();
25+
Elements links = root.select("h4");
26+
for (Element element : links) {
27+
Element href = element.select("a").first();
28+
String title = href.text();
29+
String newsUrl = href.attr("href");
30+
31+
log.info("Reading news from {}...", newsUrl);
32+
33+
Document news = Jsoup.connect(newsUrl).get();
34+
Element cite = news.select("cite").first();
35+
String stringDate = cite.text();
36+
LocalDateTime dateTime = DateTimeUtils.convertDateTime(stringDate);
37+
38+
String text = null;
39+
Element main = news.select("main").first();
40+
Elements paragraphs = main.select("p");
41+
for (Element p : paragraphs) {
42+
if (!p.text().isEmpty()) {
43+
text = (text != null? text + "\n" : "") + p.text();
44+
}
45+
}
46+
String imgUrl = null;
47+
48+
events.add(new EventDTO(title, newsUrl, dateTime, text, imgUrl));
49+
}
50+
} catch (IOException e) {
51+
log.error(e.getMessage());
52+
}
53+
54+
return events;
55+
}
56+
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
1+
server:
2+
port: 8080
3+
14
spring:
25
datasource:
36
url: jdbc:postgresql://localhost:5432/news_crawler
47
username: postgres
58
password: root
69

10+
executor:
11+
waitMin: 30
12+
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
package news.crawler;
2+
3+
import news.crawler.common.DateTimeUtils;
4+
import org.junit.jupiter.api.Assertions;
5+
import org.junit.jupiter.api.Test;
6+
7+
import java.time.LocalDateTime;
8+
import java.time.format.DateTimeFormatter;
9+
10+
public class DateTimeUtilsTest {
11+
12+
@Test
13+
public void testConvertDateTime() {
14+
String strDateTime = "17 октября, 2023 15:39";
15+
LocalDateTime dateTime = LocalDateTime.parse("17.10.2023 15:39", DateTimeFormatter.ofPattern("d.M.yyyy HH:mm"));
16+
Assertions.assertEquals(dateTime, DateTimeUtils.convertDateTime(strDateTime));
17+
18+
strDateTime = "Čtvrtek, 30. listopadu 2023 08:32";
19+
dateTime = LocalDateTime.parse("30.11.2023 08:32", DateTimeFormatter.ofPattern("d.M.yyyy HH:mm"));
20+
Assertions.assertEquals(dateTime, DateTimeUtils.convertDateTime(strDateTime));
21+
22+
String strDate = "15.9.2023";
23+
String strTime = "12:25";
24+
dateTime = LocalDateTime.parse(strDate + " " + strTime, DateTimeFormatter.ofPattern("d.M.yyyy HH:mm"));
25+
Assertions.assertEquals(dateTime, DateTimeUtils.convertDateTime(strDate, strTime));
26+
}
27+
}
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
package news.crawler;
2+
3+
import news.crawler.controller.dto.SourceConfigDTO;
4+
import news.crawler.service.SourceConfigService;
5+
import org.junit.jupiter.api.*;
6+
import org.springframework.beans.factory.annotation.Autowired;
7+
import org.springframework.boot.test.context.SpringBootTest;
8+
9+
import java.util.List;
10+
11+
@SpringBootTest
12+
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
13+
public class SourceConfigServiceTest {
14+
15+
@Autowired
16+
private SourceConfigService configService;
17+
18+
@Test
19+
@Order(1)
20+
public void testAdd() {
21+
SourceConfigDTO sourceCFG = new SourceConfigDTO(null, "root", "news", "className", null);
22+
SourceConfigDTO config = configService.add(sourceCFG);
23+
Assertions.assertNotNull(config.getId());
24+
25+
List<SourceConfigDTO> configs = configService.findAll();
26+
Assertions.assertEquals(1, configs.size());
27+
28+
SourceConfigDTO cfg = configs.get(0);
29+
Assertions.assertEquals(config.getId(), cfg.getId());
30+
Assertions.assertEquals(config.getRootUrl(), cfg.getRootUrl());
31+
Assertions.assertEquals(config.getNewsSuffix(), cfg.getNewsSuffix());
32+
Assertions.assertEquals(config.getClassName(), cfg.getClassName());
33+
}
34+
35+
@Test
36+
@Order(2)
37+
public void testUpdate() {
38+
SourceConfigDTO sourceCFG = new SourceConfigDTO(1, "rootNew", "news", "className", null);
39+
SourceConfigDTO config = configService.update(sourceCFG);
40+
Assertions.assertNotNull(config);
41+
Assertions.assertEquals(sourceCFG.getRootUrl(), config.getRootUrl());
42+
43+
List<SourceConfigDTO> configs = configService.findAll();
44+
Assertions.assertEquals(1, configs.size());
45+
}
46+
47+
@Test
48+
@Order(3)
49+
public void testDelete() {
50+
SourceConfigDTO config = configService.delete(1);
51+
Assertions.assertNotNull(config);
52+
53+
List<SourceConfigDTO> configs = configService.findAll();
54+
Assertions.assertEquals(0, configs.size());
55+
}
56+
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
spring:
2+
datasource:
3+
driverClassName: org.h2.Driver
4+
url: jdbc:h2:mem:db
5+
username: sa
6+
password:
7+
8+
executor:
9+
enabled: false
10+

0 commit comments

Comments
 (0)