summaryrefslogtreecommitdiff
path: root/py/webget.py
blob: b71d87acc2976db9bda5ff6d6e140620e675666c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
import hashlib
import os
import re
import sys
import time
from collections import deque
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

class WebpageDownloader:
    """Crawl one site and save its pages and same-domain stylesheets to disk.

    Pages are written as flat ``.html`` files directly inside ``output_dir``;
    stylesheets go to ``<output_dir>/assets/css``. Only URLs whose network
    location equals that of ``base_url`` are followed or downloaded.
    """

    def __init__(self, base_url, output_dir='downloaded_pages', ignore_navigation=True,
                 timeout=10, delay=1):
        """Set up the HTTP session and create the output directories.

        Args:
            base_url: URL the crawl starts from; its netloc defines the site.
            output_dir: Directory that receives the saved pages and assets.
            ignore_navigation: When true, links inside navigation-like
                elements (menus, headers, footers, ...) are not followed.
            timeout: Per-request timeout in seconds.
            delay: Pause in seconds between two consecutive page requests.
        """
        self.base_url = base_url
        self.domain = urlparse(base_url).netloc
        self.output_dir = output_dir
        self.assets_dir = os.path.join(output_dir, 'assets')
        self.css_dir = os.path.join(self.assets_dir, 'css')
        self.downloaded_urls = set()
        self.downloaded_assets = {}  # Maps absolute stylesheet URLs to local paths
        self.ignore_navigation = ignore_navigation
        self.timeout = timeout
        self.delay = delay
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

        # Common navigation elements to ignore
        self.nav_classes = {
            'nav', 'navbar', 'navigation', 'menu', 'sidebar', 'header', 'footer',
            'topbar', 'top-bar', 'site-nav', 'main-nav', 'primary-nav',
            'secondary-nav', 'breadcrumb', 'pagination'
        }
        # The same names are used for id matching (this is the same set object).
        self.nav_ids = self.nav_classes

        # Create necessary directories
        os.makedirs(self.css_dir, exist_ok=True)

    def clean_filename(self, url):
        """Convert URL to a valid, flat filename (without extension handling).

        The URL path has its slashes and other filesystem-hostile characters
        replaced by underscores. An empty path or ``/`` becomes ``index``. When
        a query string is present, the first 8 hex digits of its MD5 digest
        are appended so that different queries map to different files.
        """
        parsed = urlparse(url)
        path = parsed.path

        if not path or path == '/':
            path = 'index'
        else:
            path = path.strip('/')

        # Handle query parameters by appending them as a hash
        if parsed.query:
            query_hash = hashlib.md5(parsed.query.encode()).hexdigest()[:8]
            path = f"{path}_{query_hash}"

        # Replace characters that are invalid in filenames on common platforms
        return re.sub(r'[<>:"/\\|?*]', '_', path)

    def download_resource(self, url):
        """Download a resource (webpage or asset) and return its content.

        Returns:
            The response body as bytes, or ``None`` when the request fails
            (the error is printed, not raised).
        """
        try:
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()
            return response.content
        except requests.RequestException as e:
            print(f"Error downloading {url}: {e}")
            return None

    def download_and_save_css(self, url, base_url):
        """Download a CSS file, save it locally and return the href to use.

        Args:
            url: The stylesheet href as it appears in the page.
            base_url: URL of the page, used to resolve a relative ``url``.

        Returns:
            The stylesheet's path relative to ``output_dir`` (with forward
            slashes) when it was saved. ``data:`` URLs are returned unchanged.
            For stylesheets on another domain, or when the download fails,
            the absolute URL is returned so the saved page still points at
            the live stylesheet.
        """
        # Handle data URLs
        if url.startswith('data:'):
            return url

        # Convert relative URLs to absolute; the absolute form is the cache
        # key because the same relative href means different files on
        # pages located in different directories.
        absolute_url = urljoin(base_url, url)
        if absolute_url in self.downloaded_assets:
            return self.downloaded_assets[absolute_url]

        # Only download from same domain
        if urlparse(absolute_url).netloc != self.domain:
            return absolute_url

        content = self.download_resource(absolute_url)
        if not content:
            return absolute_url

        # The file is stored under a flattened name in assets/css, so relative
        # url(...) references inside it would no longer resolve; make them
        # absolute. CSS that is not valid UTF-8 is saved untouched.
        try:
            content = self.process_css_urls(content, absolute_url).encode('utf-8')
        except UnicodeDecodeError:
            pass

        # Generate filename for CSS
        css_filename = self.clean_filename(absolute_url)
        if not css_filename.endswith('.css'):
            css_filename += '.css'

        css_path = os.path.join(self.css_dir, css_filename)

        # Save the CSS file
        with open(css_path, 'wb') as f:
            f.write(content)

        # Store the relative path from HTML to CSS; hrefs always use '/',
        # whereas os.path.relpath uses the platform separator.
        relative_path = os.path.relpath(css_path, self.output_dir).replace(os.sep, '/')
        self.downloaded_assets[absolute_url] = relative_path
        return relative_path

    def process_css_urls(self, css_content, base_url):
        """Rewrite relative ``url(...)`` references in CSS to absolute URLs.

        Args:
            css_content: CSS source as ``str``, or as UTF-8 encoded bytes.
            base_url: URL the relative references are resolved against.

        Returns:
            The CSS as ``str``. References that are empty, fragment-only
            (``#id``), ``data:``, ``http:`` or ``https:`` are left exactly as
            written; all others are replaced by their absolute form, keeping
            the original quote character if there was one.

        Raises:
            UnicodeDecodeError: If ``css_content`` is bytes that are not
                valid UTF-8.
        """
        if isinstance(css_content, (bytes, bytearray)):
            css_text = css_content.decode('utf-8')
        else:
            css_text = str(css_content)

        def replace_url(match):
            raw = match.group(1).strip()
            quote = raw[0] if raw[:1] in ('"', "'") else ''
            url = raw.strip('"\'')
            if not url or url.startswith(('#', 'data:', 'http:', 'https:')):
                return match.group(0)
            return f"url({quote}{urljoin(base_url, url)}{quote})"

        # Replace URLs in CSS
        return re.sub(r'url\((.*?)\)', replace_url, css_text)

    def save_page(self, content, url):
        """Save the webpage content to a file and process its CSS.

        Linked same-domain stylesheets are downloaded and their ``href``
        rewritten; ``url(...)`` references in inline ``<style>`` blocks are
        made absolute. Nothing is written when ``content`` is empty.
        """
        if not content:
            return

        soup = BeautifulSoup(content, 'html.parser')

        # Process external stylesheets
        for link in soup.find_all('link', rel='stylesheet'):
            href = link.get('href')
            if href:
                link['href'] = self.download_and_save_css(href, url)

        # Process inline styles
        for style in soup.find_all('style'):
            if style.string:
                style.string = self.process_css_urls(style.string, url)

        # Save the processed HTML
        filename = self.clean_filename(url)
        if not filename.endswith('.html'):
            filename += '.html'

        filepath = os.path.join(self.output_dir, filename)

        # dirname is '' when output_dir is empty; os.makedirs('') would raise.
        target_dir = os.path.dirname(filepath)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(str(soup))
        print(f"Saved: {filepath}")

    def is_navigation_element(self, element):
        """Check if an element is likely part of navigation.

        The element and each of its ancestors are inspected. A match is any
        of: a navigation name occurring as a substring of the class
        attribute, an id equal to a navigation name, a ``role`` of
        ``navigation``/``menu``/``menubar``, or a ``nav``/``header``/``footer``
        tag.
        """
        if not element:
            return False

        current = element
        while current is not None:
            classes = current.get('class')
            if classes:
                if isinstance(classes, str):
                    classes = [classes]
                class_text = ' '.join(classes).lower()
                if any(nav_class in class_text for nav_class in self.nav_classes):
                    return True

            element_id = current.get('id')
            if element_id and element_id.lower() in self.nav_ids:
                return True

            role = current.get('role')
            if role and role.lower() in {'navigation', 'menu', 'menubar'}:
                return True

            if current.name in {'nav', 'header', 'footer'}:
                return True

            current = current.parent

        return False

    def extract_links(self, content, current_url):
        """Extract all same-domain links from the page.

        Links are taken from the first ``main``/``article`` element, else the
        first element with class or id ``content``, else the whole document.
        Fragments are removed so that ``page#a`` and ``page#b`` count as the
        same page.

        Returns:
            A set of absolute http(s) URLs on this downloader's domain.
        """
        soup = BeautifulSoup(content, 'html.parser')
        links = set()

        main_content = (soup.find(['main', 'article'])
                        or soup.find(class_='content')
                        or soup.find(id='content'))

        for a in (main_content or soup).find_all('a', href=True):
            if self.ignore_navigation and self.is_navigation_element(a):
                continue

            href = a['href'].strip()
            if not href or href.startswith('#'):
                continue

            parsed = urlparse(urljoin(current_url, href))
            if parsed.scheme in ('http', 'https') and parsed.netloc == self.domain:
                links.add(parsed._replace(fragment='').geturl())

        return links

    def download_site(self, max_pages=50):
        """Download the website and its linked pages.

        Pages are visited breadth-first starting at ``base_url`` until
        ``max_pages`` pages have been saved or no unvisited links remain.
        Each URL is requested at most once, including URLs that failed.
        """
        # FIFO queue plus sorted insertion keeps the crawl order stable from
        # run to run, which matters when max_pages cuts the crawl short.
        urls_to_process = deque([self.base_url])
        attempted = set()
        pages_downloaded = 0

        while urls_to_process and pages_downloaded < max_pages:
            current_url = urls_to_process.popleft()

            if current_url in attempted or current_url in self.downloaded_urls:
                continue
            attempted.add(current_url)

            print(f"Downloading: {current_url}")
            content = self.download_resource(current_url)

            if content:
                self.save_page(content, current_url)
                self.downloaded_urls.add(current_url)
                pages_downloaded += 1

                new_links = self.extract_links(content, current_url)
                urls_to_process.extend(
                    sorted(new_links - self.downloaded_urls - attempted)
                )

            # Be polite between requests, but do not wait once the crawl is over.
            if urls_to_process and pages_downloaded < max_pages:
                time.sleep(self.delay)

        print(f"\nDownload complete! Downloaded {pages_downloaded} pages to {self.output_dir}")

if __name__ == "__main__":
    # The first command-line argument, when given, overrides the example URL.
    url = sys.argv[1] if len(sys.argv) > 1 else "https://www.fractalpress.com"

    # ignore_navigation=True skips links found in menus, headers, footers, etc.;
    # pass False to follow navigation links as well.
    downloader = WebpageDownloader(url, output_dir='downloaded_pages', ignore_navigation=True)

    # Keep the example run short: stop after five saved pages.
    downloader.download_site(max_pages=5)