Skip to content

Commit 8ee741a

Browse files
committed
Merge branch 'develop'
2 parents 23558d2 + bf5f45d commit 8ee741a

11 files changed

Lines changed: 111 additions & 64 deletions

File tree

Dockerfile

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,34 @@
11
FROM openjdk:17-jdk
22

3-
ARG JAR_FILE_PATH=/build/libs/*.jar
3+
RUN apt-get update && apt-get install -y --no-install-recommends \
4+
wget gnupg unzip \
5+
libnss3 libatk-bridge2.0-0 libgbm1 libgtk-3-0 \
6+
libx11-xcb1 libxrandr2 libxdamage1 libxcomposite1 libxss1 libasound2 \
7+
&& rm -rf /var/lib/apt/lists/*
48

5-
COPY $JAR_FILE_PATH app.jar
9+
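# Install Chrome (google-chrome-stable pinned to 135.0.7049.84-1) from Google's apt repository
# NOTE(review): the RUN steps in this file call apt-get; confirm the openjdk:17-jdk base image is apt-based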
10+
RUN wget -qO- https://dl.google.com/linux/linux_signing_key.pub \
11+
| gpg --dearmor -o /usr/share/keyrings/google.gpg \
12+
&& echo "deb [signed-by=/usr/share/keyrings/google.gpg] \
13+
https://dl.google.com/linux/chrome/deb/ stable main" \
14+
> /etc/apt/sources.list.d/google.list \
15+
&& apt-get update \
16+
&& apt-get install -y --no-install-recommends \
17+
google-chrome-stable=135.0.7049.84-1 \
18+
&& rm -rf /var/lib/apt/lists/*
619

7-
EXPOSE 8080
20+
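# Install Chromedriver (same 135.0.7049.84 version as the pinned Chrome package)
# NOTE(review): confirm chromedriver.storage.googleapis.com actually serves this version; newer ChromeDriver builds may be published elsewhere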
21+
RUN wget -qO /tmp/chromedriver.zip \
22+
https://chromedriver.storage.googleapis.com/135.0.7049.84/chromedriver_linux64.zip \
23+
&& unzip /tmp/chromedriver.zip -d /usr/local/bin \
24+
&& chmod +x /usr/local/bin/chromedriver \
25+
&& rm /tmp/chromedriver.zip
26+
27+
# Environment variables
28+
ENV CHROME_OPTIONS="--headless --no-sandbox --disable-dev-shm-usage"
829

30+
# Application setup
31+
ARG JAR_FILE_PATH=/build/libs/*.jar
32+
COPY $JAR_FILE_PATH app.jar
33+
EXPOSE 8080
934
ENTRYPOINT ["java", "-jar", "app.jar"]

build.gradle

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ repositories {
2525

2626
dependencies {
2727
implementation 'org.springframework.boot:spring-boot-starter-data-jpa'
28-
implementation 'org.springframework.boot:spring-boot-starter-web'
2928
implementation 'org.springframework.boot:spring-boot-starter-mail'
3029
implementation 'org.springframework.boot:spring-boot-starter-thymeleaf'
3130
implementation 'org.springframework.boot:spring-boot-starter-validation'
@@ -46,6 +45,12 @@ dependencies {
4645
runtimeOnly 'com.mysql:mysql-connector-j'
4746
runtimeOnly 'com.h2database:h2'
4847
testRuntimeOnly 'org.junit.platform:junit-platform-launcher'
48+
49+
implementation('org.springframework.boot:spring-boot-starter-web') {
50+
exclude group: 'org.springframework.boot', module: 'spring-boot-starter-tomcat'
51+
}
52+
implementation 'org.springframework.boot:spring-boot-starter-undertow'
53+
4954
}
5055

5156
tasks.named('test') {

src/main/java/com/davcatch/devcatch/common/integration/crawling/WebCrawler.java

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ public class WebCrawler {
2727
public Optional<Document> getDocument(String link) {
2828
log.debug("크롤링 시작 : {}", link);
2929

30+
Document document = null;
3031
try {
3132
ResponseEntity<String> response = crawlingRestTemplate.getForEntity(link, String.class);
3233
String html = response.getBody();
@@ -36,14 +37,12 @@ public Optional<Document> getDocument(String link) {
3637
return Optional.empty();
3738
}
3839

39-
Document document = Jsoup.parse(html);
40+
document = Jsoup.parse(html);
4041
log.debug("크롤링 정상 수집 : {}", link);
41-
42-
return Optional.of(document);
43-
4442
} catch (Exception e) {
4543
log.error("({}) 크롤링 중 에러 발생 : {}", link, e.getMessage());
46-
return Optional.empty();
4744
}
45+
46+
return Optional.ofNullable(document);
4847
}
4948
}

src/main/java/com/davcatch/devcatch/common/integration/rss/RssReaderService.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ public class RssReaderService {
3030
public Optional<SyndFeed> reader(Source source) {
3131
log.debug("[{}] RSS FEED 수집 시작", source.getName());
3232

33+
SyndFeed feed = null;
3334
try {
3435
ResponseEntity<String> response = rssRestTemplate.getForEntity(source.getFeedUrl(), String.class);
3536

@@ -45,13 +46,12 @@ public Optional<SyndFeed> reader(Source source) {
4546
syndFeedInput.setPreserveWireFeed(true);
4647
syndFeedInput.setXmlHealerOn(true); // XML 문법 오류 자동 복구 활성화
4748

48-
SyndFeed feed = syndFeedInput.build(new StringReader(rssFeedXml));
49+
feed = syndFeedInput.build(new StringReader(rssFeedXml));
4950
log.debug("RSS FEED 정상 수집 : {}", source.getName());
50-
return Optional.of(feed);
51-
5251
} catch (FeedException e) {
5352
log.error("[{}] RSS FEED 수집중 오류 발생 : {}", source.getName(), e.getMessage());
54-
return Optional.empty();
5553
}
54+
55+
return Optional.ofNullable(feed);
5656
}
5757
}

src/main/java/com/davcatch/devcatch/common/integration/selenium/SeleniumBrowserService.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ public Optional<SyndFeed> reader(Source source) {
6565
log.debug("[{}] 셀레니움 헤드리스 RSS FEED 수집 시작", source.getName());
6666

6767
WebDriver webDriver = null;
68+
SyndFeed feed = null;
6869
try {
6970
webDriver = createWebDriver();
7071

@@ -93,13 +94,14 @@ public Optional<SyndFeed> reader(Source source) {
9394
syndFeedInput.setXmlHealerOn(true); // XML 문법 오류 자동 복구 활성화
9495

9596
String parse = Jsoup.parse(pageSource).body().text();
96-
SyndFeed feed = syndFeedInput.build(new StringReader(parse));
97-
return Optional.of(feed);
97+
98+
feed = syndFeedInput.build(new StringReader(parse));
9899
} catch (Exception e) {
99100
log.error("[{}] RSS FEED 수집중 오류 발생 : {}", source.getName(), e.getMessage());
100-
return Optional.empty();
101101
} finally {
102102
destroyWebDriver(webDriver);
103103
}
104+
105+
return Optional.ofNullable(feed);
104106
}
105107
}

src/main/java/com/davcatch/devcatch/common/scheduler/article/parser/strategy/AbstractArticleStrategy.java

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
package com.davcatch.devcatch.common.scheduler.article.parser.strategy;
22

3+
import java.util.ArrayList;
34
import java.util.Collections;
45
import java.util.List;
56

@@ -25,7 +26,36 @@ public abstract class AbstractArticleStrategy implements ArticleParseStrategy {
2526
private final SeleniumBrowserService seleniumBrowserService;
2627
private final ContentExtractorFactory contentExtractorFactory;
2728

28-
protected List<SyndEntry> getEntries(Source source) {
29+
/**
 * Parses the entries of the given source into {@link ParsedArticle}s.
 *
 * <p>Shared flow for every strategy: resolve the content extractor for the source's parse
 * method, obtain the entries through {@link #fetchEntries(Source)}, then convert each entry
 * through {@link #processEntry(SyndEntry, Source, ContentExtractorStrategy)}.
 *
 * @param source the source whose entries are parsed
 * @return the articles that were parsed successfully; an empty list when no entries were fetched
 * @throws CustomException declared by the signature; exceptions raised while handling a single
 *         entry are caught inside the loop and do not propagate
 */
@Override
30+
public List<ParsedArticle> process(Source source) throws CustomException {
31+
ContentExtractorStrategy contentExtractor = getContentExtractor(source.getParseMethod());
32+
33+
List<SyndEntry> entries = fetchEntries(source);
34+
35+
// Nothing was fetched, so there is nothing to parse.
if (entries.isEmpty()) {
36+
return Collections.emptyList();
37+
}
38+
39+
List<ParsedArticle> parsedArticles = new ArrayList<>();
40+
for (SyndEntry entry : entries) {
41+
// A failing entry must not abort the batch: its exception is logged and the loop continues.
try {
42+
ParsedArticle article = processEntry(entry, source, contentExtractor);
43+
// A null result is left out of the returned list.
if (article != null) {
44+
parsedArticles.add(article);
45+
}
46+
} catch (Exception e) {
47+
// NOTE(review): only e.getMessage() is logged, so the stack trace is lost; consider passing e as the final argument.
log.error("[{}] 엔트리 처리 중 오류 발생: {}", source.getName(), e.getMessage());
48+
}
49+
}
50+
51+
log.debug("[{}] 총 {}개 아티클 파싱 완료", source.getName(), parsedArticles.size());
52+
return parsedArticles;
53+
}
54+
55+
/**
 * Supplies the feed entries that {@code process(Source)} iterates over.
 *
 * <p>{@code process} calls {@code isEmpty()} on the result without a null check, so
 * implementations must return a non-null list.
 *
 * @param source the source to fetch entries for
 * @return the entries to parse
 */
protected abstract List<SyndEntry> fetchEntries(Source source);
56+
/**
 * Converts a single feed entry into a {@link ParsedArticle}.
 *
 * <p>Called once per entry by {@code process(Source)}, which leaves out {@code null} results
 * and logs and skips any entry whose conversion throws.
 *
 * @param entry the feed entry to convert
 * @param source the source the entry belongs to
 * @param contentExtractor the extractor resolved for the source's parse method
 * @return the parsed article
 * @throws CustomException if the entry cannot be converted
 */
protected abstract ParsedArticle processEntry(SyndEntry entry, Source source, ContentExtractorStrategy contentExtractor) throws CustomException;
57+
58+
protected List<SyndEntry> getEntriesFromRss(Source source) {
2959
return rssReaderService.reader(source)
3060
.map(SyndFeed::getEntries)
3161
.orElseGet(() -> {

src/main/java/com/davcatch/devcatch/common/scheduler/article/parser/strategy/CloudFlareParseStrategy.java

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
package com.davcatch.devcatch.common.scheduler.article.parser.strategy;
22

3-
import java.util.ArrayList;
43
import java.util.List;
54
import java.util.Set;
65

@@ -27,17 +26,14 @@ public CloudFlareParseStrategy(RssReaderService rssReaderService, SeleniumBrowse
2726
}
2827

2928
@Override
30-
public List<ParsedArticle> process(Source source) throws CustomException {
31-
ContentExtractorStrategy extractor = getContentExtractor(source.getParseMethod());
32-
List<SyndEntry> entries = getEntriesFromHeadless(source);
33-
34-
List<ParsedArticle> parsedArticles = new ArrayList<>();
35-
for (SyndEntry entry : entries) {
36-
String content = extractor.extractContent(entry, null);
37-
parsedArticles.add(ParsedArticle.of(entry, source, content));
38-
}
29+
/**
 * Fetches the entries for this strategy by delegating to {@code getEntriesFromHeadless(source)}.
 *
 * @param source the source to fetch entries for
 * @return whatever {@code getEntriesFromHeadless} returns for the source
 */
protected List<SyndEntry> fetchEntries(Source source) {
30+
return getEntriesFromHeadless(source);
31+
}
3932

40-
return parsedArticles;
33+
/**
 * Builds a {@link ParsedArticle} from the entry itself.
 *
 * <p>The content is extracted from the entry only; {@code null} is passed as the second
 * argument of {@code extractContent}.
 *
 * @param entry the feed entry to convert
 * @param source the source the entry belongs to
 * @param contentExtractor the extractor used to obtain the article content
 * @return the article created by {@code ParsedArticle.of(entry, source, content)}
 */
@Override
34+
protected ParsedArticle processEntry(SyndEntry entry, Source source, ContentExtractorStrategy contentExtractor) {
35+
String content = contentExtractor.extractContent(entry, null);
36+
return ParsedArticle.of(entry, source, content);
4137
}
4238

4339
@Override

src/main/java/com/davcatch/devcatch/common/scheduler/article/parser/strategy/CrawlingParseStrategy.java

Lines changed: 15 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,21 @@
11
package com.davcatch.devcatch.common.scheduler.article.parser.strategy;
22

3-
import java.util.ArrayList;
43
import java.util.List;
54
import java.util.Set;
65

76
import org.jsoup.nodes.Document;
87
import org.springframework.stereotype.Component;
98

109
import com.davcatch.devcatch.common.exception.CustomException;
11-
import com.davcatch.devcatch.common.integration.selenium.SeleniumBrowserService;
12-
import com.davcatch.devcatch.domain.source.ParseMethod;
13-
import com.davcatch.devcatch.domain.source.Source;
10+
import com.davcatch.devcatch.common.exception.ErrorCode;
1411
import com.davcatch.devcatch.common.integration.crawling.WebCrawler;
1512
import com.davcatch.devcatch.common.integration.rss.RssReaderService;
13+
import com.davcatch.devcatch.common.integration.selenium.SeleniumBrowserService;
1614
import com.davcatch.devcatch.common.scheduler.article.dto.ParsedArticle;
1715
import com.davcatch.devcatch.common.scheduler.article.extractor.factory.ContentExtractorFactory;
1816
import com.davcatch.devcatch.common.scheduler.article.extractor.strategy.ContentExtractorStrategy;
17+
import com.davcatch.devcatch.domain.source.ParseMethod;
18+
import com.davcatch.devcatch.domain.source.Source;
1919
import com.rometools.rome.feed.synd.SyndEntry;
2020

2121
import lombok.extern.slf4j.Slf4j;
@@ -33,24 +33,19 @@ public CrawlingParseStrategy(RssReaderService rssReaderService, SeleniumBrowserS
3333
}
3434

3535
@Override
36-
public List<ParsedArticle> process(Source source) throws CustomException {
37-
ContentExtractorStrategy extractor = getContentExtractor(source.getParseMethod());
38-
List<SyndEntry> entries = getEntries(source);
39-
40-
List<ParsedArticle> parsedArticles = new ArrayList<>();
41-
for (int i = 0; i < Math.min(MAX_PARSE_PAGE, entries.size()) ; i++) {
42-
SyndEntry entry = entries.get(i);
43-
44-
String link = source.isUseLink() ? entry.getLink() : entry.getUri();
45-
Document document = webCrawler.getDocument(link).orElse(null);
46-
if (document == null)
47-
continue;
36+
/**
 * Fetches the entries via {@code getEntriesFromRss(source)} and keeps at most
 * {@code MAX_PARSE_PAGE} of them, counted from the start of the list.
 *
 * @param source the source to fetch entries for
 * @return the leading entries, capped at {@code MAX_PARSE_PAGE}
 */
protected List<SyndEntry> fetchEntries(Source source) {
37+
List<SyndEntry> entries = getEntriesFromRss(source);
38+
// subList returns a view backed by the fetched list, not a copy.
return entries.subList(0, Math.min(MAX_PARSE_PAGE, entries.size()));
39+
}
4840

49-
String content = extractor.extractContent(null, document);
50-
parsedArticles.add(ParsedArticle.of(entry, source, content));
51-
}
41+
/**
 * Crawls the page an entry points to and builds a {@link ParsedArticle} from its document.
 *
 * <p>The address is {@code entry.getLink()} when {@code source.isUseLink()} is true and
 * {@code entry.getUri()} otherwise. The content is extracted from the crawled document;
 * {@code null} is passed as the entry argument of {@code extractContent}.
 *
 * @param entry the feed entry to convert
 * @param source the source the entry belongs to
 * @param contentExtractor the extractor used to obtain the article content
 * @return the article created by {@code ParsedArticle.of(entry, source, content)}
 * @throws CustomException with {@code ErrorCode.CONTENT_PARSE_ERROR} when the crawler returns
 *         no document for the address
 */
@Override
42+
protected ParsedArticle processEntry(SyndEntry entry, Source source, ContentExtractorStrategy contentExtractor) throws CustomException {
43+
String link = source.isUseLink() ? entry.getLink() : entry.getUri();
44+
// An empty Optional from the crawler is turned into a CustomException.
Document document = webCrawler.getDocument(link)
45+
.orElseThrow(() -> new CustomException(ErrorCode.CONTENT_PARSE_ERROR));
5246

53-
return parsedArticles;
47+
String content = contentExtractor.extractContent(null, document);
48+
return ParsedArticle.of(entry, source, content);
5449
}
5550

5651
@Override

src/main/java/com/davcatch/devcatch/common/scheduler/article/parser/strategy/RssParseStrategy.java

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,17 @@
11
package com.davcatch.devcatch.common.scheduler.article.parser.strategy;
22

3-
import java.util.ArrayList;
43
import java.util.List;
54
import java.util.Set;
65

76
import org.springframework.stereotype.Component;
87

8+
import com.davcatch.devcatch.common.integration.rss.RssReaderService;
99
import com.davcatch.devcatch.common.integration.selenium.SeleniumBrowserService;
1010
import com.davcatch.devcatch.common.scheduler.article.dto.ParsedArticle;
11+
import com.davcatch.devcatch.common.scheduler.article.extractor.factory.ContentExtractorFactory;
1112
import com.davcatch.devcatch.common.scheduler.article.extractor.strategy.ContentExtractorStrategy;
1213
import com.davcatch.devcatch.domain.source.ParseMethod;
1314
import com.davcatch.devcatch.domain.source.Source;
14-
import com.davcatch.devcatch.common.exception.CustomException;
15-
import com.davcatch.devcatch.common.integration.rss.RssReaderService;
16-
import com.davcatch.devcatch.common.scheduler.article.extractor.factory.ContentExtractorFactory;
1715
import com.rometools.rome.feed.synd.SyndEntry;
1816

1917
import lombok.extern.slf4j.Slf4j;
@@ -27,17 +25,14 @@ public RssParseStrategy(RssReaderService rssReaderService, SeleniumBrowserServic
2725
}
2826

2927
@Override
30-
public List<ParsedArticle> process(Source source) throws CustomException {
31-
ContentExtractorStrategy extractor = getContentExtractor(source.getParseMethod());
32-
List<SyndEntry> entries = getEntries(source);
33-
34-
List<ParsedArticle> parsedArticles = new ArrayList<>();
35-
for (SyndEntry entry : entries) {
36-
String content = extractor.extractContent(entry, null);
37-
parsedArticles.add(ParsedArticle.of(entry, source, content));
38-
}
28+
/**
 * Fetches the entries for this strategy by delegating to {@code getEntriesFromRss(source)}.
 *
 * @param source the source to fetch entries for
 * @return whatever {@code getEntriesFromRss} returns for the source
 */
protected List<SyndEntry> fetchEntries(Source source) {
29+
return getEntriesFromRss(source);
30+
}
3931

40-
return parsedArticles;
32+
/**
 * Builds a {@link ParsedArticle} from the entry itself.
 *
 * <p>The content is extracted from the entry only; {@code null} is passed as the second
 * argument of {@code extractContent}.
 *
 * @param entry the feed entry to convert
 * @param source the source the entry belongs to
 * @param contentExtractor the extractor used to obtain the article content
 * @return the article created by {@code ParsedArticle.of(entry, source, content)}
 */
@Override
33+
protected ParsedArticle processEntry(SyndEntry entry, Source source, ContentExtractorStrategy contentExtractor) {
34+
String content = contentExtractor.extractContent(entry, null);
35+
return ParsedArticle.of(entry, source, content);
4136
}
4237

4338
@Override

src/main/java/com/davcatch/devcatch/repository/tag/TagRepository.java

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,6 @@
1111

1212
/**
 * Spring Data JPA repository for {@code Tag} entities with {@code Long} identifiers.
 */
public interface TagRepository extends JpaRepository<Tag, Long> {
1313

14-
Optional<Tag> findByTagType(TagType tagType);
15-
1614
/**
 * Returns every {@code Tag} whose {@code tagType} is contained in the given list.
 *
 * <p>NOTE(review): the named parameter {@code :tagTypes} is bound without {@code @Param};
 * presumably this relies on parameter names being retained at compile time — confirm.
 *
 * @param tagTypes the tag types to match
 * @return the matching tags
 */
@Query("select t from Tag t where t.tagType in :tagTypes")
1715
List<Tag> findInTagType(List<TagType> tagTypes);
1816
}

0 commit comments

Comments
 (0)