cms-scrape.py
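"""Download the Internet-Only Manual (IOM) PDFs from cms.gov.

The script walks the IOM index page, collects each publication's page,
extracts the PDF links on it, and saves the files to a local folder,
pausing between requests.

Requires: requests, beautifulsoup4 (pip install requests beautifulsoup4)
"""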
import requests
from bs4 import BeautifulSoup
import os
import time
from urllib.parse import urljoin, urlparse
import re

class CMSManualDownloader:
    def __init__(self, output_folder="cms_manuals_pdfs"):
        self.base_url = "https://www.cms.gov"
        self.main_page = "https://www.cms.gov/medicare/regulations-guidance/manuals/internet-only-manuals-ioms"
        self.output_folder = output_folder
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

        # Create output folder if it doesn't exist
        if not os.path.exists(self.output_folder):
            os.makedirs(self.output_folder)
            print(f"Created folder: {self.output_folder}")

    def get_publication_links(self):
        """Extract all publication page links from the main IOM page"""
        print("Fetching main page...")

        try:
            response = self.session.get(self.main_page, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Find all publication links (adjust selector based on actual HTML structure)
            publication_links = []

            # Look for links in the table or list
            for link in soup.find_all('a', href=True):
                href = link['href']
                # Match publication number patterns like /cms050111, /cms012345, etc.
                if re.search(r'/cms\d+', href) or 'internet-only-manuals-ioms-items' in href:
                    full_url = urljoin(self.base_url, href)
                    if full_url not in publication_links:
                        publication_links.append(full_url)
                        print(f"Found publication: {full_url}")

            print(f"\nTotal publications found: {len(publication_links)}")
            return publication_links

        except Exception as e:
            print(f"Error fetching main page: {e}")
            return []

    def get_pdf_links(self, publication_url):
        """Extract all PDF download links from a publication page"""
        print(f"\nFetching PDFs from: {publication_url}")

        try:
            response = self.session.get(publication_url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            pdf_links = []

            # Find all links that contain 'download' or end with .pdf
            for link in soup.find_all('a', href=True):
                href = link['href']
                if 'download' in href.lower() or href.lower().endswith('.pdf'):
                    full_url = urljoin(self.base_url, href)
                    if full_url.lower().endswith('.pdf'):
                        pdf_links.append(full_url)
                        print(f" Found PDF: {os.path.basename(full_url)}")

            return pdf_links

        except Exception as e:
            print(f"Error fetching publication page {publication_url}: {e}")
            return []

    def download_pdf(self, pdf_url):
        """Download a single PDF file"""
        try:
            # Extract filename from URL
            filename = os.path.basename(urlparse(pdf_url).path)
            filepath = os.path.join(self.output_folder, filename)

            # Skip if file already exists
            if os.path.exists(filepath):
                print(f" Skipping (already exists): {filename}")
                return True

            print(f" Downloading: {filename}")
            response = self.session.get(pdf_url, timeout=60, stream=True)
            response.raise_for_status()

            # Write file in chunks
            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

            print(f" ✓ Downloaded: {filename}")
            return True

        except Exception as e:
            print(f" ✗ Error downloading {pdf_url}: {e}")
            return False

    def run(self):
        """Main execution method"""
        print("=" * 60)
        print("CMS Medicare Manuals PDF Downloader")
        print("=" * 60)

        # Step 1: Get all publication links
        publication_links = self.get_publication_links()

        if not publication_links:
            print("No publications found. Please check the website structure.")
            return

        # Step 2: Process each publication
        total_pdfs = 0
        downloaded_pdfs = 0

        for i, pub_url in enumerate(publication_links, 1):
            print(f"\n[{i}/{len(publication_links)}] Processing publication...")

            # Get PDF links from this publication
            pdf_links = self.get_pdf_links(pub_url)
            total_pdfs += len(pdf_links)

            # Download each PDF
            for pdf_url in pdf_links:
                if self.download_pdf(pdf_url):
                    downloaded_pdfs += 1
                # Be respectful - add delay between downloads
                time.sleep(1)

            # Delay between publications
            time.sleep(2)

        # Summary
        print("\n" + "=" * 60)
        print("Download Complete!")
        print(f"Total PDFs found: {total_pdfs}")
        print(f"Successfully downloaded: {downloaded_pdfs}")
        print(f"Output folder: {os.path.abspath(self.output_folder)}")
        print("=" * 60)

# Run the scraper
if __name__ == "__main__":
    downloader = CMSManualDownloader(output_folder="cms_manuals_pdfs")
    downloader.run()