From 9b7b03b8f7d017b7c4341aeda7d461d5952db514 Mon Sep 17 00:00:00 2001 From: dmori Date: Fri, 2 Jan 2026 22:11:45 +0900 Subject: [PATCH 01/10] =?UTF-8?q?improve:=20SSL=20=EC=9D=B8=EC=A6=9D?= =?UTF-8?q?=EC=84=9C=20=EB=A7=8C=EB=A3=8C=20=EB=AC=B8=EC=A0=9C=EB=95=8C?= =?UTF-8?q?=EB=AC=B8=EC=97=90=20=EC=B6=94=EA=B0=80=ED=96=88=EB=8D=98=20?= =?UTF-8?q?=EB=AA=A8=EB=93=A0=20=EC=9D=B8=EC=A6=9D=EC=84=9C=20=EC=8B=A0?= =?UTF-8?q?=EB=A2=B0=20=EC=BD=94=EB=93=9C=EB=A5=BC=20MITM=20=EB=AC=B8?= =?UTF-8?q?=EC=A0=9C=EB=A5=BC=20=EB=A7=89=EA=B8=B0=20=EC=9C=84=ED=95=B4=20?= =?UTF-8?q?=EC=A0=9C=EA=B1=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../global/config/WebClientConfig.java | 47 ++++++------------- 1 file changed, 14 insertions(+), 33 deletions(-) diff --git a/src/main/java/com/techfork/global/config/WebClientConfig.java b/src/main/java/com/techfork/global/config/WebClientConfig.java index 6edcfae..6a07ee5 100644 --- a/src/main/java/com/techfork/global/config/WebClientConfig.java +++ b/src/main/java/com/techfork/global/config/WebClientConfig.java @@ -1,8 +1,5 @@ package com.techfork.global.config; -import io.netty.handler.ssl.SslContext; -import io.netty.handler.ssl.SslContextBuilder; -import io.netty.handler.ssl.util.InsecureTrustManagerFactory; import lombok.extern.slf4j.Slf4j; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Configuration; @@ -10,7 +7,6 @@ import org.springframework.web.reactive.function.client.WebClient; import reactor.netty.http.client.HttpClient; -import javax.net.ssl.SSLException; import java.time.Duration; @@ -20,34 +16,19 @@ public class WebClientConfig { @Bean public WebClient webClient() { - try { - // SSL Context 생성 - 모든 인증서 신뢰 - SslContext sslContext = SslContextBuilder - .forClient() - .trustManager(InsecureTrustManagerFactory.INSTANCE) - .build(); - - // HttpClient 설정 (Netty 기반) - HttpClient httpClient = HttpClient.create() - .secure(sslContextSpec -> sslContextSpec.sslContext(sslContext)) - .responseTimeout(Duration.ofSeconds(30)) - .followRedirect(true); // Redirect 자동 추적 - - // WebClient 생성 - WebClient webClient = WebClient.builder() - .clientConnector(new ReactorClientHttpConnector(httpClient)) - .defaultHeader("User-Agent", "Mozilla/5.0 (compatible; TechFork-Bot/1.0)") - .defaultHeader("Accept", "application/rss+xml, application/xml, application/atom+xml, text/xml, */*") - .codecs(configurer -> configurer - .defaultCodecs() - .maxInMemorySize(10 * 1024 * 1024)) - .build(); - - return webClient; - - } catch (SSLException e) { - log.error("WebClient 초기화 실패", e); - throw new RuntimeException("WebClient 초기화 실패", e); - } + // HttpClient 설정 (Netty 기반) + HttpClient httpClient = HttpClient.create() + .responseTimeout(Duration.ofSeconds(30)) + .followRedirect(true); // Redirect 자동 추적 + + // WebClient 생성 + return WebClient.builder() + .clientConnector(new ReactorClientHttpConnector(httpClient)) + .defaultHeader("User-Agent", "Mozilla/5.0 (compatible; TechFork-Bot/1.0)") + .defaultHeader("Accept", "application/rss+xml, application/xml, application/atom+xml, text/xml, */*") + .codecs(configurer -> configurer + .defaultCodecs() + .maxInMemorySize(10 * 1024 * 1024)) + .build(); } } From 3ffabc4600d9b1d3888510dd2f6196331283530c Mon Sep 17 00:00:00 2001 From: dmori Date: Fri, 2 Jan 2026 22:17:52 +0900 Subject: [PATCH 02/10] =?UTF-8?q?improve:=20=EC=97=B0=EA=B2=B0=20=ED=83=80?= =?UTF-8?q?=EC=9E=84=EC=95=84=EC=9B=83=20=EC=84=A4=EC=A0=95=20=EC=B6=94?= =?UTF-8?q?=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../com/techfork/global/config/WebClientConfig.java | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/main/java/com/techfork/global/config/WebClientConfig.java b/src/main/java/com/techfork/global/config/WebClientConfig.java index 6a07ee5..ad3d34e 100644 --- a/src/main/java/com/techfork/global/config/WebClientConfig.java +++ b/src/main/java/com/techfork/global/config/WebClientConfig.java @@ -1,5 +1,6 @@ package com.techfork.global.config; +import io.netty.channel.ChannelOption; import lombok.extern.slf4j.Slf4j; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Configuration; @@ -18,17 +19,18 @@ public class WebClientConfig { public WebClient webClient() { // HttpClient 설정 (Netty 기반) HttpClient httpClient = HttpClient.create() - .responseTimeout(Duration.ofSeconds(30)) + .option(ChannelOption.CONNECT_TIMEOUT_MILLIS, 10_000) // 연결 타임아웃 + .responseTimeout(Duration.ofSeconds(30)) // 응답 타임아웃 .followRedirect(true); // Redirect 자동 추적 // WebClient 생성 return WebClient.builder() .clientConnector(new ReactorClientHttpConnector(httpClient)) - .defaultHeader("User-Agent", "Mozilla/5.0 (compatible; TechFork-Bot/1.0)") - .defaultHeader("Accept", "application/rss+xml, application/xml, application/atom+xml, text/xml, */*") + .defaultHeader("User-Agent", "Mozilla/5.0 (compatible; TechFork-Bot/1.0)") // 봇 차단 방지 + .defaultHeader("Accept", "application/rss+xml, application/xml, application/atom+xml, text/xml, */*") // RSS/XML 콘텐츠 명시 .codecs(configurer -> configurer .defaultCodecs() - .maxInMemorySize(10 * 1024 * 1024)) + .maxInMemorySize(10 * 1024 * 1024)) // 큰 RSS 피드 처리 가능 .build(); } } From 4975d198b5882ac6c7333f10ed48c616366e4b66 Mon Sep 17 00:00:00 2001 From: dmori Date: Fri, 2 Jan 2026 22:38:56 +0900 Subject: [PATCH 03/10] =?UTF-8?q?improve:=20taskExecutor=EB=A1=9C=20?= =?UTF-8?q?=EB=B3=91=EB=A0=AC=ED=99=94=EA=B0=80=20=EC=9D=98=EB=AF=B8?= =?UTF-8?q?=EA=B0=80=20=EC=97=86=EC=9C=BC=EB=AF=80=EB=A1=9C=20=EB=8C=80?= =?UTF-8?q?=EC=8B=A0=20parallelStream()=EC=9C=BC=EB=A1=9C=20=ED=85=8C?= =?UTF-8?q?=ED=81=AC=20=EB=B8=94=EB=A1=9C=EA=B7=B8=EB=A7=88=EB=8B=A4=20?= =?UTF-8?q?=EB=B3=91=EB=A0=AC=ED=99=94?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../domain/source/batch/RssFeedReader.java | 38 +++++++++---------- .../source/config/RssCrawlingJobConfig.java | 15 -------- 2 files changed, 17 insertions(+), 36 deletions(-) diff --git a/src/main/java/com/techfork/domain/source/batch/RssFeedReader.java b/src/main/java/com/techfork/domain/source/batch/RssFeedReader.java index 9aaaa04..4732b92 100644 --- a/src/main/java/com/techfork/domain/source/batch/RssFeedReader.java +++ b/src/main/java/com/techfork/domain/source/batch/RssFeedReader.java @@ -22,6 +22,7 @@ import java.util.Date; import java.util.List; import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.stream.Stream; @Slf4j @Component @@ -57,32 +58,27 @@ public RssFeedItem read() { */ private synchronized void initializeQueue() { // Double-checked locking - if (itemQueue != null) { - return; - } + if (itemQueue != null) return; itemQueue = new ConcurrentLinkedQueue<>(); List techBlogs = techBlogRepository.findAll(); log.info("총 {}개 테크 블로그 RSS 수집 시작", techBlogs.size()); - int totalItems = 0; - for (TechBlog techBlog : techBlogs) { - try { - List items = fetchRssFeed(techBlog); - if (!items.isEmpty()) { - itemQueue.addAll(items); - totalItems += items.size(); - log.info("[{}] RSS 수집 성공: {}개 아이템", techBlog.getCompanyName(), items.size()); - } else { - log.warn("[{}] RSS 피드에 아이템이 없습니다", techBlog.getCompanyName()); - } - } catch (Exception e) { - log.error("[{}] RSS 수집 실패: {}", techBlog.getCompanyName(), e.getMessage(), e); - // 실패해도 다음 블로그 계속 처리 - } - } - - log.info("RSS 수집 초기화 완료: 총 {}개 아이템을 큐에 추가", totalItems); + List allItems = techBlogs.parallelStream() + .flatMap(techBlog -> { + try { + List items = fetchRssFeed(techBlog); + log.info("[{}] RSS 수집 성공: {}개", techBlog.getCompanyName(), items.size()); + return items.stream(); + } catch (Exception e) { + log.error("[{}] RSS 수집 실패: {}", techBlog.getCompanyName(), e.getMessage()); + return Stream.empty(); + } + }) + .toList(); + + itemQueue.addAll(allItems); + log.info("RSS 수집 초기화 완료: 총 {}개 아이템을 큐에 추가", allItems.size()); } private List fetchRssFeed(TechBlog techBlog) throws Exception { diff --git a/src/main/java/com/techfork/domain/source/config/RssCrawlingJobConfig.java b/src/main/java/com/techfork/domain/source/config/RssCrawlingJobConfig.java index 3c18b14..10bc549 100644 --- a/src/main/java/com/techfork/domain/source/config/RssCrawlingJobConfig.java +++ b/src/main/java/com/techfork/domain/source/config/RssCrawlingJobConfig.java @@ -74,8 +74,6 @@ public Step fetchAndSaveRssStep() { .reader(rssFeedReader) .processor(rssToPostProcessor) .writer(postBatchWriter) - // 병렬 처리: 5개 스레드로 동시에 RSS 수집 - .taskExecutor(rssTaskExecutor()) .faultTolerant() // 건너뛰기 정책: 최대 10개 아이템까지 건너뛰기 허용 .skipLimit(10) @@ -118,19 +116,6 @@ public Step embedAndIndexStep() { .build(); } - @Bean - public TaskExecutor rssTaskExecutor() { - ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor(); - executor.setCorePoolSize(5); - executor.setMaxPoolSize(10); - executor.setQueueCapacity(20); - executor.setThreadNamePrefix("rss-crawl-"); - executor.setWaitForTasksToCompleteOnShutdown(true); - executor.setAwaitTerminationSeconds(60); - executor.initialize(); - return executor; - } - @Bean public TaskExecutor embeddingTaskExecutor() { ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor(); From c56436e2ad5c462d2a58682c0567def4a715f0a8 Mon Sep 17 00:00:00 2001 From: dmori Date: Fri, 2 Jan 2026 22:50:23 +0900 Subject: [PATCH 04/10] =?UTF-8?q?refactor:=20taskExecutor=EB=A5=BC=20?= =?UTF-8?q?=EC=A0=9C=EA=B1=B0=ED=95=A8=EC=97=90=20=EB=94=B0=EB=9D=BC=20Rea?= =?UTF-8?q?der=EC=97=90=EC=84=9C=20=EB=8B=A8=EC=88=9C=20List=EB=A5=BC=20?= =?UTF-8?q?=EC=82=AC=EC=9A=A9=ED=95=98=EB=8F=84=EB=A1=9D=20=EB=B3=80?= =?UTF-8?q?=EA=B2=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../domain/source/batch/RssFeedReader.java | 39 +++++++------------ 1 file changed, 14 insertions(+), 25 deletions(-) diff --git a/src/main/java/com/techfork/domain/source/batch/RssFeedReader.java b/src/main/java/com/techfork/domain/source/batch/RssFeedReader.java index 4732b92..c7337b9 100644 --- a/src/main/java/com/techfork/domain/source/batch/RssFeedReader.java +++ b/src/main/java/com/techfork/domain/source/batch/RssFeedReader.java @@ -33,43 +33,33 @@ public class RssFeedReader implements ItemReader { private final TechBlogRepository techBlogRepository; private final WebClient webClient; - private ConcurrentLinkedQueue itemQueue; + private List items; + private int currentIndex = 0; @Override public RssFeedItem read() { - // 첫 실행 시 모든 RSS 아이템을 큐에 추가 - if (itemQueue == null) { - initializeQueue(); + if (items == null) { + initializeItems(); } - // 큐에서 아이템 꺼내기 (Thread-Safe) - RssFeedItem item = itemQueue.poll(); - - if (item == null) { - log.info("모든 RSS 피드 수집 완료"); + if (currentIndex >= items.size()) { + log.info("모든 RSS 피드 수집 완료: 총 {}개", items.size()); + return null; } - return item; + return items.get(currentIndex++); } - /** - * 모든 RSS 피드를 미리 수집하여 큐에 저장 - * 한 번만 실행되며, 여러 스레드가 큐에서 안전하게 아이템을 가져감 - */ - private synchronized void initializeQueue() { - // Double-checked locking - if (itemQueue != null) return; - - itemQueue = new ConcurrentLinkedQueue<>(); + private void initializeItems() { List techBlogs = techBlogRepository.findAll(); log.info("총 {}개 테크 블로그 RSS 수집 시작", techBlogs.size()); - List allItems = techBlogs.parallelStream() + items = techBlogs.parallelStream() .flatMap(techBlog -> { try { - List items = fetchRssFeed(techBlog); - log.info("[{}] RSS 수집 성공: {}개", techBlog.getCompanyName(), items.size()); - return items.stream(); + List feedItems = fetchRssFeed(techBlog); + log.info("[{}] RSS 수집 성공: {}개", techBlog.getCompanyName(), feedItems.size()); + return feedItems.stream(); } catch (Exception e) { log.error("[{}] RSS 수집 실패: {}", techBlog.getCompanyName(), e.getMessage()); return Stream.empty(); @@ -77,8 +67,7 @@ private synchronized void initializeQueue() { }) .toList(); - itemQueue.addAll(allItems); - log.info("RSS 수집 초기화 완료: 총 {}개 아이템을 큐에 추가", allItems.size()); + log.info("RSS 수집 초기화 완료: 총 {}개 아이템", items.size()); } private List fetchRssFeed(TechBlog techBlog) throws Exception { From 29ec2d8ab4e3768b7fed4f0620664d24544f0d39 Mon Sep 17 00:00:00 2001 From: dmori Date: Fri, 2 Jan 2026 22:59:12 +0900 Subject: [PATCH 05/10] =?UTF-8?q?docs:=20=ED=81=AC=EB=A1=A4=EB=A7=81=20?= =?UTF-8?q?=EC=B2=98=EB=A6=AC=20=EC=A4=91=20=EC=A4=91=EB=B3=B5=20url=20?= =?UTF-8?q?=EB=A1=9C=EA=B7=B8=20=EC=A0=9C=EA=B1=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../com/techfork/domain/source/batch/RssToPostProcessor.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/main/java/com/techfork/domain/source/batch/RssToPostProcessor.java b/src/main/java/com/techfork/domain/source/batch/RssToPostProcessor.java index 83ec98e..d4624e8 100644 --- a/src/main/java/com/techfork/domain/source/batch/RssToPostProcessor.java +++ b/src/main/java/com/techfork/domain/source/batch/RssToPostProcessor.java @@ -26,9 +26,7 @@ public class RssToPostProcessor implements ItemProcessor { @Override public Post process(RssFeedItem item) { - // 중복 체크 if (postRepository.existsByUrl(item.url())) { - log.debug("중복 URL 스킵: {}", item.url()); return null; // null 반환 시 Writer에서 처리 안 함 } From af73bee225d9cd9e15ef60efb91a32d4caea4e4a Mon Sep 17 00:00:00 2001 From: dmori Date: Fri, 2 Jan 2026 23:00:31 +0900 Subject: [PATCH 06/10] =?UTF-8?q?refactor:=20TechBlog=EB=8A=94=20=EB=B0=98?= =?UTF-8?q?=EB=93=9C=EC=8B=9C=20=EC=A1=B4=EC=9E=AC=ED=95=98=EA=B3=A0,=20?= =?UTF-8?q?=EB=94=B0=EB=A1=9C=20TechBlog=20=EC=97=94=ED=8B=B0=ED=8B=B0?= =?UTF-8?q?=EC=9D=98=20id=EA=B0=92=EC=9D=84=20=EC=A0=9C=EC=99=B8=ED=95=9C?= =?UTF-8?q?=20=ED=95=84=EB=93=9C=EB=93=A4=EC=9D=84=20=EC=82=AC=EC=9A=A9?= =?UTF-8?q?=ED=95=98=EC=A7=80=20=EC=95=8A=EC=9C=BC=EB=AF=80=EB=A1=9C=20get?= =?UTF-8?q?ReferenceId=EB=A1=9C=20=EC=B5=9C=EC=A0=81=ED=99=94?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../com/techfork/domain/source/batch/RssToPostProcessor.java | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/main/java/com/techfork/domain/source/batch/RssToPostProcessor.java b/src/main/java/com/techfork/domain/source/batch/RssToPostProcessor.java index d4624e8..2656820 100644 --- a/src/main/java/com/techfork/domain/source/batch/RssToPostProcessor.java +++ b/src/main/java/com/techfork/domain/source/batch/RssToPostProcessor.java @@ -30,10 +30,7 @@ public Post process(RssFeedItem item) { return null; // null 반환 시 Writer에서 처리 안 함 } - TechBlog techBlog = techBlogRepository.findById(item.techBlogId()) - .orElseThrow(() -> new IllegalStateException( - "TechBlog를 찾을 수 없습니다. ID: " + item.techBlogId())); - + TechBlog techBlog = techBlogRepository.getReferenceById(item.techBlogId()); return Post.create(item, techBlog); } } From 97090403ec3b0478266a7cd01bacda1906adba84 Mon Sep 17 00:00:00 2001 From: dmori Date: Fri, 2 Jan 2026 23:06:49 +0900 Subject: [PATCH 07/10] =?UTF-8?q?chore:=20=EC=82=AC=EC=9A=A9=ED=95=98?= =?UTF-8?q?=EC=A7=80=20=EC=95=8A=EB=8A=94=20import=EB=AC=B8=20=EC=A0=9C?= =?UTF-8?q?=EA=B1=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/com/techfork/domain/source/batch/RssFeedReader.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/main/java/com/techfork/domain/source/batch/RssFeedReader.java b/src/main/java/com/techfork/domain/source/batch/RssFeedReader.java index c7337b9..4ea8a6e 100644 --- a/src/main/java/com/techfork/domain/source/batch/RssFeedReader.java +++ b/src/main/java/com/techfork/domain/source/batch/RssFeedReader.java @@ -21,7 +21,6 @@ import java.time.ZoneId; import java.util.Date; import java.util.List; -import java.util.concurrent.ConcurrentLinkedQueue; import java.util.stream.Stream; @Slf4j @@ -150,4 +149,4 @@ private LocalDateTime convertToLocalDateTime(Date date) { .atZone(ZoneId.systemDefault()) .toLocalDateTime(); } -} +} \ No newline at end of file From 4822ebc04ad03fe421c699ff48216dc538f0b4ef Mon Sep 17 00:00:00 2001 From: dmori Date: Fri, 2 Jan 2026 23:38:31 +0900 Subject: [PATCH 08/10] =?UTF-8?q?improve:=20=EC=A4=91=EB=B3=B5=20=EC=B2=98?= =?UTF-8?q?=EB=A6=AC=EB=A5=BC=20Reader=EC=97=90=EC=84=9C=20=EB=AF=B8?= =?UTF-8?q?=EB=A6=AC=20=ED=95=84=ED=84=B0=EB=A7=81=20=ED=95=98=EB=8A=94=20?= =?UTF-8?q?=EB=B0=A9=EC=8B=9D=EC=9C=BC=EB=A1=9C=20=EB=B3=80=EA=B2=BD=20(25?= =?UTF-8?q?=EC=B4=88=20->=2012=EC=B4=88=EB=A1=9C=20=EA=B0=9C=EC=84=A0?= =?UTF-8?q?=EB=90=A8,=20url=EC=9D=80=20unique=20=EC=86=8D=EC=84=B1?= =?UTF-8?q?=EC=9D=B4=EB=AF=80=EB=A1=9C=20=EC=9D=B8=EB=8D=B1=EC=8A=A4=20?= =?UTF-8?q?=ED=83=80=EC=84=9C=20post=20=ED=85=8C=EC=9D=B4=EB=B8=94?= =?UTF-8?q?=EC=9D=B4=20=EC=BB=A4=EC=A0=B8=EB=8F=84=20=EA=B4=9C=EC=B0=AE?= =?UTF-8?q?=EC=9D=8C)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../domain/post/repository/PostRepository.java | 5 +++-- .../techfork/domain/source/batch/RssFeedReader.java | 13 ++++++++++++- .../domain/source/batch/RssToPostProcessor.java | 7 ------- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/main/java/com/techfork/domain/post/repository/PostRepository.java b/src/main/java/com/techfork/domain/post/repository/PostRepository.java index e175897..afeca71 100644 --- a/src/main/java/com/techfork/domain/post/repository/PostRepository.java +++ b/src/main/java/com/techfork/domain/post/repository/PostRepository.java @@ -13,10 +13,11 @@ import java.time.LocalDateTime; import java.util.List; import java.util.Optional; +import java.util.Set; public interface PostRepository extends JpaRepository { - - boolean existsByUrl(String url); + @Query("SELECT p.url FROM Post p WHERE p.url IN :urls") + Set findExistingUrls(@Param("urls") List urls); @Query(""" SELECT p FROM Post p diff --git a/src/main/java/com/techfork/domain/source/batch/RssFeedReader.java b/src/main/java/com/techfork/domain/source/batch/RssFeedReader.java index 4ea8a6e..374bb78 100644 --- a/src/main/java/com/techfork/domain/source/batch/RssFeedReader.java +++ b/src/main/java/com/techfork/domain/source/batch/RssFeedReader.java @@ -4,6 +4,7 @@ import com.rometools.rome.feed.synd.SyndFeed; import com.rometools.rome.io.SyndFeedInput; import com.rometools.rome.io.XmlReader; +import com.techfork.domain.post.repository.PostRepository; import com.techfork.domain.source.dto.RssFeedItem; import com.techfork.domain.source.entity.TechBlog; import com.techfork.domain.source.repository.TechBlogRepository; @@ -21,6 +22,7 @@ import java.time.ZoneId; import java.util.Date; import java.util.List; +import java.util.Set; import java.util.stream.Stream; @Slf4j @@ -30,6 +32,7 @@ public class RssFeedReader implements ItemReader { private final TechBlogRepository techBlogRepository; + private final PostRepository postRepository; private final WebClient webClient; private List items; @@ -53,7 +56,7 @@ private void initializeItems() { List techBlogs = techBlogRepository.findAll(); log.info("총 {}개 테크 블로그 RSS 수집 시작", techBlogs.size()); - items = techBlogs.parallelStream() + List allItems = techBlogs.parallelStream() .flatMap(techBlog -> { try { List feedItems = fetchRssFeed(techBlog); @@ -66,6 +69,14 @@ private void initializeItems() { }) .toList(); + Set existingUrls = postRepository.findExistingUrls( + allItems.stream().map(RssFeedItem::url).toList() + ); + + items = allItems.stream() + .filter(item -> !existingUrls.contains(item.url())) + .toList(); + log.info("RSS 수집 초기화 완료: 총 {}개 아이템", items.size()); } diff --git a/src/main/java/com/techfork/domain/source/batch/RssToPostProcessor.java b/src/main/java/com/techfork/domain/source/batch/RssToPostProcessor.java index 2656820..9d716c7 100644 --- a/src/main/java/com/techfork/domain/source/batch/RssToPostProcessor.java +++ b/src/main/java/com/techfork/domain/source/batch/RssToPostProcessor.java @@ -1,7 +1,6 @@ package com.techfork.domain.source.batch; import com.techfork.domain.post.entity.Post; -import com.techfork.domain.post.repository.PostRepository; import com.techfork.domain.source.dto.RssFeedItem; import com.techfork.domain.source.entity.TechBlog; import com.techfork.domain.source.repository.TechBlogRepository; @@ -13,7 +12,6 @@ /** * RssFeedItem을 Post 엔티티로 변환하는 Processor - * 중복 체크도 여기서 수행하여 이미 존재하는 URL은 null 반환 */ @Slf4j @Component @@ -21,15 +19,10 @@ @RequiredArgsConstructor public class RssToPostProcessor implements ItemProcessor { - private final PostRepository postRepository; private final TechBlogRepository techBlogRepository; @Override public Post process(RssFeedItem item) { - if (postRepository.existsByUrl(item.url())) { - return null; // null 반환 시 Writer에서 처리 안 함 - } - TechBlog techBlog = techBlogRepository.getReferenceById(item.techBlogId()); return Post.create(item, techBlog); } From e1fd6d339fc96cbbe5a42ec81ff95d82d1fb6640 Mon Sep 17 00:00:00 2001 From: dmori Date: Fri, 2 Jan 2026 23:44:27 +0900 Subject: [PATCH 09/10] =?UTF-8?q?improve:=20=EC=A2=80=EB=B9=84=20=ED=94=84?= =?UTF-8?q?=EB=A1=9C=EC=84=B8=EC=8A=A4=20=EC=A0=95=EB=A6=AC=EB=A5=BC=205?= =?UTF-8?q?=EB=B6=84=EB=A7=88=EB=8B=A4=EA=B0=80=20=EC=95=84=EB=8B=8C=20?= =?UTF-8?q?=ED=81=AC=EB=A1=A4=EB=A7=81=EC=9D=B4=20=EB=81=9D=EB=82=9C=20?= =?UTF-8?q?=ED=9B=84=20=EC=8B=A4=ED=96=89=ED=95=98=EB=8F=84=EB=A1=9D=20?= =?UTF-8?q?=EB=B3=80=EA=B2=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../source/scheduler/RssCrawlingScheduler.java | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/main/java/com/techfork/domain/source/scheduler/RssCrawlingScheduler.java b/src/main/java/com/techfork/domain/source/scheduler/RssCrawlingScheduler.java index 8385cf3..79b0e7d 100644 --- a/src/main/java/com/techfork/domain/source/scheduler/RssCrawlingScheduler.java +++ b/src/main/java/com/techfork/domain/source/scheduler/RssCrawlingScheduler.java @@ -31,7 +31,7 @@ public class RssCrawlingScheduler { /** * 매일 오전 5시마다 RSS 크롤링 실행 - * cron: 0 0 5 * * * -> 매 시간 정각 + * cron: 0 0 5 * * * -> 매일 오전 5시 */ @Scheduled(cron = "0 0 5 * * *") public void scheduleCrawling() { @@ -50,17 +50,11 @@ public void scheduleCrawling() { log.error("Unexpected error during scheduled crawling", e); } finally { distributedLock.unlock(CRAWLING_LOCK_KEY, lockValue); + cleanupStaleHistories(); } } - /** - * 5분마다 오래된 RUNNING 상태의 이력을 정리 (좀비 프로세스 방지) - * cron: 매 5분마다 실행 - */ - @Scheduled(cron = "0 */5 * * * *") - public void cleanupStaleHistories() { - log.debug("Checking for stale crawling histories"); - + private void cleanupStaleHistories() { var staleHistories = crawlingHistoryRepository.findByStatusAndStartedAtBefore( ECrawlingStatus.RUNNING, java.time.LocalDateTime.now().minusHours(1) ); From ea58d30f32eea133b4399e1156ef6d82f5e59953 Mon Sep 17 00:00:00 2001 From: dmori Date: Fri, 2 Jan 2026 23:45:12 +0900 Subject: [PATCH 10/10] =?UTF-8?q?docs:=20=EC=A0=95=EC=B1=85=20=EB=B3=80?= =?UTF-8?q?=EA=B2=BD=EC=97=90=20=EB=94=B0=EB=A5=B8=20=ED=81=AC=EB=A1=A4?= =?UTF-8?q?=EB=A7=81=20=EC=A3=BC=EC=84=9D=20=EC=97=85=EB=8D=B0=EC=9D=B4?= =?UTF-8?q?=ED=8A=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../techfork/domain/source/scheduler/RssCrawlingScheduler.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/com/techfork/domain/source/scheduler/RssCrawlingScheduler.java b/src/main/java/com/techfork/domain/source/scheduler/RssCrawlingScheduler.java index 79b0e7d..06ae791 100644 --- a/src/main/java/com/techfork/domain/source/scheduler/RssCrawlingScheduler.java +++ b/src/main/java/com/techfork/domain/source/scheduler/RssCrawlingScheduler.java @@ -14,7 +14,7 @@ /** * RSS 크롤링 스케줄러 - * - 1시간마다 RSS 피드 크롤링 실행 + * - 24시간마다 RSS 피드 크롤링 실행 * - Redis 분산 락으로 중복 실행 방지 */ @Slf4j