11package com .davcatch .devcatch .common .scheduler .article .parser .strategy ;
22
3- import java .util .ArrayList ;
43import java .util .List ;
54import java .util .Set ;
65
76import org .jsoup .nodes .Document ;
87import org .springframework .stereotype .Component ;
98
109import com .davcatch .devcatch .common .exception .CustomException ;
11- import com .davcatch .devcatch .common .integration .selenium .SeleniumBrowserService ;
12- import com .davcatch .devcatch .domain .source .ParseMethod ;
13- import com .davcatch .devcatch .domain .source .Source ;
10+ import com .davcatch .devcatch .common .exception .ErrorCode ;
1411import com .davcatch .devcatch .common .integration .crawling .WebCrawler ;
1512import com .davcatch .devcatch .common .integration .rss .RssReaderService ;
13+ import com .davcatch .devcatch .common .integration .selenium .SeleniumBrowserService ;
1614import com .davcatch .devcatch .common .scheduler .article .dto .ParsedArticle ;
1715import com .davcatch .devcatch .common .scheduler .article .extractor .factory .ContentExtractorFactory ;
1816import com .davcatch .devcatch .common .scheduler .article .extractor .strategy .ContentExtractorStrategy ;
17+ import com .davcatch .devcatch .domain .source .ParseMethod ;
18+ import com .davcatch .devcatch .domain .source .Source ;
1919import com .rometools .rome .feed .synd .SyndEntry ;
2020
2121import lombok .extern .slf4j .Slf4j ;
@@ -33,24 +33,19 @@ public CrawlingParseStrategy(RssReaderService rssReaderService, SeleniumBrowserS
3333 }
3434
3535 @ Override
36- public List <ParsedArticle > process (Source source ) throws CustomException {
37- ContentExtractorStrategy extractor = getContentExtractor (source .getParseMethod ());
38- List <SyndEntry > entries = getEntries (source );
39-
40- List <ParsedArticle > parsedArticles = new ArrayList <>();
41- for (int i = 0 ; i < Math .min (MAX_PARSE_PAGE , entries .size ()) ; i ++) {
42- SyndEntry entry = entries .get (i );
43-
44- String link = source .isUseLink () ? entry .getLink () : entry .getUri ();
45- Document document = webCrawler .getDocument (link ).orElse (null );
46- if (document == null )
47- continue ;
36+ protected List <SyndEntry > fetchEntries (Source source ) {
37+ List <SyndEntry > entries = getEntriesFromRss (source );
38+ return entries .subList (0 , Math .min (MAX_PARSE_PAGE , entries .size ()));
39+ }
4840
49- String content = extractor .extractContent (null , document );
50- parsedArticles .add (ParsedArticle .of (entry , source , content ));
51- }
41+ @ Override
42+ protected ParsedArticle processEntry (SyndEntry entry , Source source , ContentExtractorStrategy contentExtractor ) throws CustomException {
43+ String link = source .isUseLink () ? entry .getLink () : entry .getUri ();
44+ Document document = webCrawler .getDocument (link )
45+ .orElseThrow (() -> new CustomException (ErrorCode .CONTENT_PARSE_ERROR ));
5246
53- return parsedArticles ;
47+ String content = contentExtractor .extractContent (null , document );
48+ return ParsedArticle .of (entry , source , content );
5449 }
5550
5651 @ Override
0 commit comments