Skip to content

Commit 1b07089

Browse files
committed
fix: Selenium Headless 브라우저 사용 #26
1 parent bcf60f3 commit 1b07089

17 files changed

Lines changed: 257 additions & 108 deletions

File tree

build.gradle

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ dependencies {
3131
implementation 'org.springframework.boot:spring-boot-starter-validation'
3232
implementation 'com.github.ben-manes.caffeine:caffeine:3.2.0'
3333
implementation 'org.apache.httpcomponents.client5:httpclient5:5.4.3'
34+
implementation 'org.seleniumhq.selenium:selenium-java:4.31.0'
3435
implementation 'org.jsoup:jsoup:1.18.3'
3536
implementation 'com.rometools:rome:2.1.0'
3637
implementation 'org.springframework.boot:spring-boot-starter-security'

src/main/java/com/davcatch/devcatch/admin/service/schduler/SchedulerService.java

Lines changed: 26 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
package com.davcatch.devcatch.admin.service.schduler;
22

3+
import lombok.Builder;
34
import lombok.Data;
45
import lombok.RequiredArgsConstructor;
56
import lombok.extern.slf4j.Slf4j;
@@ -30,22 +31,24 @@ public String runArticleCreationScheduler() {
3031
articleSchedulerService.createNewArticle();
3132

3233
// 성공 이력 저장
33-
SchedulerExecution execution = new SchedulerExecution();
34-
execution.setSchedulerName("아티클 수집 및 생성");
35-
execution.setExecutionTime(startTime);
36-
execution.setStatus("성공");
37-
execution.setDetails("수동 실행으로 신규 아티클 수집 완료");
34+
SchedulerExecution execution = SchedulerExecution.builder()
35+
.schedulerName("아티클 수집 및 생성")
36+
.executionTime(startTime)
37+
.status("성공")
38+
.details("수동 실행으로 신규 아티클 수집 완료")
39+
.build();
3840

3941
return "아티클 수집 및 생성 스케줄러가 성공적으로 실행되었습니다.";
4042
} catch (Exception e) {
4143
log.error("아티클 생성 스케줄러 실행 중 오류: {}", e.getMessage(), e);
4244

4345
// 실패 이력 저장
44-
SchedulerExecution execution = new SchedulerExecution();
45-
execution.setSchedulerName("아티클 수집 및 생성");
46-
execution.setExecutionTime(startTime);
47-
execution.setStatus("실패");
48-
execution.setDetails("오류: " + e.getMessage());
46+
SchedulerExecution execution = SchedulerExecution.builder()
47+
.schedulerName("아티클 수집 및 생성")
48+
.executionTime(startTime)
49+
.status("실패")
50+
.details("오류: " + e.getMessage())
51+
.build();
4952

5053
return "아티클 수집 중 오류가 발생했습니다: " + e.getMessage();
5154
}
@@ -63,22 +66,24 @@ public String runArticleNotificationScheduler() {
6366
articleNotificationService.sendNewArticle();
6467

6568
// 성공 이력 저장
66-
SchedulerExecution execution = new SchedulerExecution();
67-
execution.setSchedulerName("아티클 이메일 발송");
68-
execution.setExecutionTime(startTime);
69-
execution.setStatus("성공");
70-
execution.setDetails("수동 실행으로 이메일 발송 완료");
69+
SchedulerExecution execution = SchedulerExecution.builder()
70+
.schedulerName("아티클 이메일 발송")
71+
.executionTime(startTime)
72+
.status("성공")
73+
.details("수동 실행으로 이메일 발송 완료")
74+
.build();
7175

7276
return "아티클 이메일 발송 스케줄러가 성공적으로 실행되었습니다.";
7377
} catch (Exception e) {
7478
log.error("아티클 발송 스케줄러 실행 중 오류: {}", e.getMessage(), e);
7579

7680
// 실패 이력 저장
77-
SchedulerExecution execution = new SchedulerExecution();
78-
execution.setSchedulerName("아티클 이메일 발송");
79-
execution.setExecutionTime(startTime);
80-
execution.setStatus("실패");
81-
execution.setDetails("오류: " + e.getMessage());
81+
SchedulerExecution execution = SchedulerExecution.builder()
82+
.schedulerName("아티클 이메일 발송")
83+
.executionTime(startTime)
84+
.status("실패")
85+
.details("오류: " + e.getMessage())
86+
.build();
8287

8388
return "아티클 이메일 발송 중 오류가 발생했습니다: " + e.getMessage();
8489
}
@@ -88,6 +93,7 @@ public String runArticleNotificationScheduler() {
8893
* 스케줄러 실행 이력 DTO
8994
*/
9095
@Data
96+
@Builder
9197
public static class SchedulerExecution {
9298
private String schedulerName;
9399
private LocalDateTime executionTime;

src/main/java/com/davcatch/devcatch/common/config/GPTConfig.java

Lines changed: 0 additions & 29 deletions
This file was deleted.

src/main/java/com/davcatch/devcatch/common/integration/crawling/WebCrawler.java

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,16 +31,19 @@ public Optional<Document> getDocument(String link) {
3131
ResponseEntity<String> response = crawlingRestTemplate.getForEntity(link, String.class);
3232
String html = response.getBody();
3333

34-
if (html != null) {
35-
Document document = Jsoup.parse(html);
36-
log.debug("크롤링 정상 수집 : {}", link);
37-
38-
return Optional.of(document);
34+
if (html == null || html.isBlank()) {
35+
log.warn("[{}] 해당 페이지를 가져올 수 없습니다", link);
36+
return Optional.empty();
3937
}
38+
39+
Document document = Jsoup.parse(html);
40+
log.debug("크롤링 정상 수집 : {}", link);
41+
42+
return Optional.of(document);
43+
4044
} catch (Exception e) {
4145
log.error("({}) 크롤링 중 에러 발생 : {}", link, e.getMessage());
46+
return Optional.empty();
4247
}
43-
44-
return Optional.empty();
4548
}
4649
}

src/main/java/com/davcatch/devcatch/common/integration/gpt/GptSummaryService.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
@Slf4j
1818
public class GptSummaryService {
1919

20-
private static final String URL = "https://api.openai.com/v1/chat/completions";
20+
private static final String GPT_API_URL = "https://api.openai.com/v1/chat/completions";
2121

2222
private final RestTemplate gptApiRestTemplate;
2323

@@ -35,11 +35,13 @@ public class GptSummaryService {
3535
public GptResponse getSummary(String content) throws CustomException {
3636
log.debug("GPT API 요청 시작");
3737

38-
GptResponse response;
38+
GptResponse response = null;
3939

4040
try {
4141
GptRequest request = GptRequest.create(content, model, sysPrompt);
42-
response = gptApiRestTemplate.postForObject(URL, request, GptResponse.class);
42+
System.out.println("gptApiRestTemplate.postForObject(GPT_API_URL, request, GptResponse.class) = "
43+
+ gptApiRestTemplate.postForObject(GPT_API_URL, request, GptResponse.class));
44+
response = gptApiRestTemplate.postForObject(GPT_API_URL, request, GptResponse.class);
4345
} catch (Exception e) {
4446
log.error("GPT API 요청중 에러 발생 : {}", e.getMessage());
4547
throw new CustomException(ErrorCode.GPT_REQUEST_ERROR);

src/main/java/com/davcatch/devcatch/common/integration/gpt/response/GptResponse.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,11 @@
22

33
import java.util.List;
44

5+
import lombok.AllArgsConstructor;
56
import lombok.Data;
67

78
@Data
9+
@AllArgsConstructor
810
public class GptResponse {
911

1012
private List<Choices> choices;

src/main/java/com/davcatch/devcatch/common/integration/rss/RssReaderService.java

Lines changed: 19 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -21,41 +21,37 @@
2121
public class RssReaderService {
2222

2323
private final RestTemplate rssRestTemplate;
24-
private final RestTemplate cloudflareRssRestTemplate;
2524

2625
/**
2726
* RSS FEED 파싱
28-
* @param feedUrl Feed URL
27+
* @param source 해당 소스
2928
* @return Feed
3029
*/
3130
public Optional<SyndFeed> reader(Source source) {
32-
log.debug("RSS FEED 수집 시작 : {}", source.getFeedUrl());
31+
log.debug("[{}] RSS FEED 수집 시작", source.getName());
3332

3433
try {
35-
ResponseEntity<String> response;
36-
if (source.getName().equals("우아한형제들")) {
37-
cloudflareRssRestTemplate.getForEntity(source.getMainUrl(), String.class);
38-
response = cloudflareRssRestTemplate.getForEntity(source.getFeedUrl(), String.class);
39-
} else
40-
response = rssRestTemplate.getForEntity(source.getFeedUrl(), String.class);
41-
42-
String xml = response.getBody();
43-
44-
if (xml != null) {
45-
SyndFeedInput syndFeedInput = new SyndFeedInput();
46-
syndFeedInput.setAllowDoctypes(true);
47-
syndFeedInput.setPreserveWireFeed(true);
48-
syndFeedInput.setXmlHealerOn(true); // XML 문법 오류 자동 복구 활성화
49-
50-
SyndFeed feed = syndFeedInput.build(new StringReader(xml));
51-
log.debug("RSS FEED 정상 수집 : {}", source.getName());
52-
return Optional.of(feed);
34+
ResponseEntity<String> response = rssRestTemplate.getForEntity(source.getFeedUrl(), String.class);
35+
36+
String rssFeedXml = response.getBody();
37+
38+
if (rssFeedXml == null || rssFeedXml.isBlank()) {
39+
log.warn("[{}] RSS FEED를 가져올 수 없습니다", source.getName());
40+
return Optional.empty();
5341
}
5442

43+
SyndFeedInput syndFeedInput = new SyndFeedInput();
44+
syndFeedInput.setAllowDoctypes(true);
45+
syndFeedInput.setPreserveWireFeed(true);
46+
syndFeedInput.setXmlHealerOn(true); // XML 문법 오류 자동 복구 활성화
47+
48+
SyndFeed feed = syndFeedInput.build(new StringReader(rssFeedXml));
49+
log.debug("RSS FEED 정상 수집 : {}", source.getName());
50+
return Optional.of(feed);
51+
5552
} catch (FeedException e) {
5653
log.error("[{}] RSS FEED 수집중 오류 발생 : {}", source.getName(), e.getMessage());
54+
return Optional.empty();
5755
}
58-
59-
return Optional.empty();
6056
}
6157
}
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
package com.davcatch.devcatch.common.integration.selenium;
2+
3+
import java.io.StringReader;
4+
import java.util.Optional;
5+
6+
import org.jsoup.Jsoup;
7+
import org.openqa.selenium.JavascriptExecutor;
8+
import org.openqa.selenium.WebDriver;
9+
import org.openqa.selenium.chrome.ChromeDriver;
10+
import org.openqa.selenium.chrome.ChromeOptions;
11+
import org.openqa.selenium.support.ui.WebDriverWait;
12+
import org.springframework.stereotype.Service;
13+
14+
import com.davcatch.devcatch.common.exception.CustomException;
15+
import com.davcatch.devcatch.common.exception.ErrorCode;
16+
import com.davcatch.devcatch.domain.source.Source;
17+
import com.rometools.rome.feed.synd.SyndFeed;
18+
import com.rometools.rome.io.SyndFeedInput;
19+
20+
import jakarta.annotation.PostConstruct;
21+
import lombok.extern.slf4j.Slf4j;
22+
23+
@Service
24+
@Slf4j
25+
public class SeleniumBrowserService {
26+
27+
private WebDriver createWebDriver() throws CustomException {
28+
try {
29+
ChromeOptions options = new ChromeOptions();
30+
options.addArguments("--headless=new");
31+
options.addArguments("--disable-gpu");
32+
options.addArguments("--no-sandbox");
33+
options.addArguments("--disable-dev-shm-usage");
34+
35+
options.addArguments("--disable-blink-features=AutomationControlled");
36+
options.setExperimentalOption("excludeSwitches", new String[]{"enable-automation"});
37+
options.setExperimentalOption("useAutomationExtension", false);
38+
39+
WebDriver webDriver = new ChromeDriver(options);
40+
41+
// 자동화 감지 방지 스크립트
42+
((JavascriptExecutor) webDriver).executeScript("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})");
43+
44+
log.info("셀레니움 헤드리스 브라우저 초기화 성공");
45+
return webDriver;
46+
} catch (Exception e) {
47+
log.error("셀레니움 헤드리스 브라우저 초기화 실패: {}", e.getMessage());
48+
throw new CustomException(ErrorCode.SERVER_ERROR);
49+
}
50+
}
51+
52+
public void destroyWebDriver(WebDriver webDriver) {
53+
if (webDriver != null) {
54+
try {
55+
webDriver.quit();
56+
log.info("셀레니움 헤드리스 브라우저 정상 종료");
57+
} catch (Exception e) {
58+
log.error("셀레니움 헤드리스 브라우저 종료 중 오류: {}", e.getMessage());
59+
}
60+
}
61+
}
62+
63+
public Optional<SyndFeed> reader(Source source) {
64+
65+
log.debug("[{}] 셀레니움 헤드리스 RSS FEED 수집 시작", source.getName());
66+
67+
WebDriver webDriver = null;
68+
try {
69+
webDriver = createWebDriver();
70+
71+
// 페이지 접근
72+
webDriver.get(source.getFeedUrl());
73+
74+
// 페이지 로딩 대기
75+
WebDriverWait wait = new WebDriverWait(webDriver, java.time.Duration.ofSeconds(10));
76+
wait.until(d -> ((JavascriptExecutor)d).executeScript("return document.readyState").equals("complete"));
77+
78+
// Cloudflare 우회 대기
79+
Thread.sleep(3000);
80+
81+
// 페이지 소스 가져오기
82+
String pageSource = webDriver.getPageSource();
83+
84+
if (pageSource == null || pageSource.isBlank()) {
85+
log.warn("[{}] 페이지 소스를 가져올 수 없습니다", source.getName());
86+
return Optional.empty();
87+
}
88+
89+
// XML 파싱
90+
SyndFeedInput syndFeedInput = new SyndFeedInput();
91+
syndFeedInput.setAllowDoctypes(true);
92+
syndFeedInput.setPreserveWireFeed(true);
93+
syndFeedInput.setXmlHealerOn(true); // XML 문법 오류 자동 복구 활성화
94+
95+
String parse = Jsoup.parse(pageSource).body().text();
96+
SyndFeed feed = syndFeedInput.build(new StringReader(parse));
97+
return Optional.of(feed);
98+
} catch (Exception e) {
99+
log.error("[{}] RSS FEED 수집중 오류 발생 : {}", source.getName(), e.getMessage());
100+
return Optional.empty();
101+
} finally {
102+
destroyWebDriver(webDriver);
103+
}
104+
}
105+
}

src/main/java/com/davcatch/devcatch/common/scheduler/article/ArticleSchedulerService.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
import org.springframework.stereotype.Service;
88

9+
import com.davcatch.devcatch.common.integration.selenium.SeleniumBrowserService;
910
import com.davcatch.devcatch.domain.source.Source;
1011
import com.davcatch.devcatch.web.service.source.SourceService;
1112

@@ -20,24 +21,26 @@ public class ArticleSchedulerService {
2021
private final SourceService sourceService;
2122
private final ArticleSchedulerTask articleSchedulerTask;
2223
private final Executor schedulerTaskExecutor;
24+
private final SeleniumBrowserService seleniumBrowserService;
2325

2426
public void createNewArticle() {
2527
List<Source> sources = sourceService.getActiveSources();
2628
log.info("총 {}개 소스 처리 시작", sources.size());
2729

2830
int batchSize = 5;
2931

30-
for (int i=0; i<sources.size(); i+=batchSize) {
32+
for (int i = 0; i < sources.size(); i += batchSize) {
3133
int endIndex = Math.min(i + batchSize, sources.size());
3234
List<Source> batchSources = sources.subList(i, endIndex);
3335

3436
List<CompletableFuture<Void>> futures = batchSources.stream()
3537
.map(source -> CompletableFuture.runAsync(() -> {
36-
articleSchedulerTask.processSource(source);
38+
articleSchedulerTask.processSource(source);
3739
}, schedulerTaskExecutor))
3840
.toList();
3941

4042
CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join();
4143
}
44+
4245
}
4346
}

0 commit comments

Comments
 (0)