@@ -22,20 +22,22 @@ def get_text_chunks(page_titles, chunk_length=512, verbose=False):
2222 :return: list of wiki text chunks
2323 """
2424 wiki_chunks = []
25- with concurrent .futures .ThreadPoolExecutor () as executor :
26- future_to_page = {executor .submit (get_page_content , page_title ): page_title for page_title in page_titles }
27- for future in concurrent .futures .as_completed (future_to_page ):
28- page_title = future_to_page [future ]
29- try :
30- wiki_content = future .result ()
31- wiki_content = preprocess_and_chunk_wiki_content (wiki_content , chunk_length = chunk_length )
32- if verbose :
33- print (f"getting content of page { page_title } " )
34- wiki_chunks .extend (wiki_content )
35- except (wikipedia .exceptions .PageError , wikipedia .exceptions .DisambiguationError ):
36- if verbose :
37- print (f"page { page_title } not found" )
38- continue # skip the page if it is not available
25+ with warnings .catch_warnings ():
26+ warnings .filterwarnings ("ignore" , category = UserWarning )
27+ with concurrent .futures .ThreadPoolExecutor () as executor :
28+ future_to_page = {executor .submit (get_page_content , page_title ): page_title for page_title in page_titles }
29+ for future in concurrent .futures .as_completed (future_to_page ):
30+ page_title = future_to_page [future ]
31+ try :
32+ wiki_content = future .result ()
33+ wiki_content = preprocess_and_chunk_wiki_content (wiki_content , chunk_length = chunk_length )
34+ if verbose :
35+ print (f"getting content of page { page_title } " )
36+ wiki_chunks .extend (wiki_content )
37+ except (wikipedia .exceptions .PageError , wikipedia .exceptions .DisambiguationError ):
38+ if verbose :
39+ print (f"page { page_title } not found" )
40+ continue # skip the page if it is not available
3941 return wiki_chunks
4042
4143
@@ -45,10 +47,7 @@ def get_page_content(page_title):
4547 :param page_title: page_title of the wikipedia page from which the content should be extracted
4648 :return: content of the wikipedia page
4749 """
48- with warnings .catch_warnings ():
49- warnings .filterwarnings ("ignore" , category = UserWarning )
50- page_content = wikipedia .page (page_title , auto_suggest = False ).content
51- return page_content
50+ return wikipedia .page (page_title , auto_suggest = False ).content
5251
5352
5453def preprocess_and_chunk_wiki_content (wiki_content , chunk_length = 512 ):
0 commit comments