"""Download a website's same-domain pages and stylesheets to a local directory."""

import hashlib
import os
import re
import sys
import time
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
class WebpageDownloader:
    """Download same-domain pages of a site, plus their stylesheets, to disk.

    Starting from ``base_url``, :meth:`download_site` fetches pages, saves
    each one as an HTML file directly inside ``output_dir`` and follows the
    links it finds that stay on the same network location. Stylesheets
    served from that same location are stored under
    ``<output_dir>/assets/css`` and the saved HTML is pointed at the local
    copies.
    """

    def __init__(self, base_url, output_dir='downloaded_pages', ignore_navigation=True):
        """Set up crawl state, the HTTP session and the output directories.

        Args:
            base_url: URL the crawl starts from. Its network location
                (``netloc``) decides which links and stylesheets are
                treated as part of the site.
            output_dir: Directory that receives the saved HTML files.
            ignore_navigation: When true, links located inside elements
                that look like navigation are not followed.
        """
        self.base_url = base_url
        self.domain = urlparse(base_url).netloc
        self.output_dir = output_dir
        self.assets_dir = os.path.join(output_dir, 'assets')
        self.css_dir = os.path.join(self.assets_dir, 'css')
        self.downloaded_urls = set()
        # Maps absolute stylesheet URLs to their path relative to output_dir.
        self.downloaded_assets = {}
        self.ignore_navigation = ignore_navigation
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        # Class-name fragments / ids that mark an element as navigation.
        self.nav_classes = {
            'nav', 'navbar', 'navigation', 'menu', 'sidebar', 'header', 'footer',
            'topbar', 'top-bar', 'site-nav', 'main-nav', 'primary-nav',
            'secondary-nav', 'breadcrumb', 'pagination'
        }
        # Same set object as nav_classes: edits to one are seen by the other.
        self.nav_ids = self.nav_classes
        # Creating the deepest directory also creates output_dir and assets_dir.
        os.makedirs(self.css_dir, exist_ok=True)

    def clean_filename(self, url):
        """Convert a URL into a flat, filesystem-safe file name (no extension).

        The name is built from the URL path with separators and other
        characters that are invalid in file names replaced by ``_``. An
        empty path becomes ``index``. When the URL has a query string, the
        first 8 hex digits of its MD5 digest are appended so that pages
        differing only in their query get distinct names.
        """
        parsed = urlparse(url)
        # "or 'index'" also covers paths made only of slashes, which would
        # otherwise yield an empty file name.
        path = parsed.path.strip('/') or 'index'
        if parsed.query:
            query_hash = hashlib.md5(parsed.query.encode()).hexdigest()[:8]
            path = f"{path}_{query_hash}"
        return re.sub(r'[<>:"/\\|?*]', '_', path)

    def download_resource(self, url):
        """Fetch ``url`` with the shared session and return the body bytes.

        Returns:
            The response body, or ``None`` when the request fails or the
            server answers with an error status (the error is printed).
        """
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return response.content
        except requests.RequestException as e:
            print(f"Error downloading {url}: {e}")
            return None

    def download_and_save_css(self, url, base_url):
        """Store a same-domain stylesheet locally and return the href to use.

        Args:
            url: The ``href`` value as written in the page.
            base_url: URL of the page containing the link; relative hrefs
                are resolved against it.

        Returns:
            The stylesheet path relative to ``output_dir`` (forward
            slashes) when it was downloaded. ``data:`` URLs are returned
            unchanged. Stylesheets on another host, or ones that could not
            be downloaded, are returned as absolute URLs so the saved page
            can still reach them.
        """
        if url.startswith('data:'):
            return url

        absolute_url = urljoin(base_url, url)

        # Cache on the absolute URL: the same relative href on two pages in
        # different directories refers to two different stylesheets.
        cached = self.downloaded_assets.get(absolute_url)
        if cached is not None:
            return cached

        if urlparse(absolute_url).netloc != self.domain:
            return absolute_url

        content = self.download_resource(absolute_url)
        if not content:
            return absolute_url

        # The file moves to assets/css, so relative url(...) references in
        # it would break; make them absolute. Content that is not valid
        # UTF-8 is stored untouched rather than risking corruption.
        try:
            css_text = content.decode('utf-8')
        except UnicodeDecodeError:
            data = content
        else:
            data = self.process_css_urls(css_text, absolute_url).encode('utf-8')

        css_filename = self.clean_filename(absolute_url)
        if not css_filename.endswith('.css'):
            css_filename += '.css'
        css_path = os.path.join(self.css_dir, css_filename)
        with open(css_path, 'wb') as f:
            f.write(data)

        # hrefs use forward slashes regardless of the host OS separator.
        relative_path = os.path.relpath(css_path, self.output_dir).replace(os.sep, '/')
        self.downloaded_assets[absolute_url] = relative_path
        return relative_path

    def process_css_urls(self, css_content, base_url):
        """Make relative ``url(...)`` references in CSS absolute.

        Args:
            css_content: CSS source as ``str`` or UTF-8 encoded ``bytes``.
            base_url: URL the relative references are resolved against.

        Returns:
            The CSS text with every relative ``url(...)`` replaced by
            ``url(<absolute URL>)``. ``data:``, ``http:``, ``https:`` and
            in-document ``#fragment`` references are left exactly as
            written.
        """
        if isinstance(css_content, (bytes, bytearray)):
            css_text = css_content.decode('utf-8', errors='replace')
        else:
            css_text = str(css_content)

        def replace_url(match):
            url = match.group(1).strip().strip('"\'')
            if not url or url.startswith(('#', 'data:', 'http:', 'https:')):
                # Nothing to resolve; keep the original text, quotes included.
                return match.group(0)
            return f"url({urljoin(base_url, url)})"

        return re.sub(r'url\((.*?)\)', replace_url, css_text)

    def save_page(self, content, url):
        """Write a page to ``output_dir`` after localising its stylesheets.

        External stylesheet links are redirected to the value returned by
        :meth:`download_and_save_css`, and ``url(...)`` references inside
        inline ``<style>`` elements are made absolute. Does nothing when
        ``content`` is empty.
        """
        if not content:
            return

        soup = BeautifulSoup(content, 'html.parser')

        for link in soup.find_all('link', rel='stylesheet'):
            href = link.get('href')
            if href:
                link['href'] = self.download_and_save_css(href, url)

        for style in soup.find_all('style'):
            if style.string:
                style.string = self.process_css_urls(style.string, url)

        filename = self.clean_filename(url)
        if not filename.endswith('.html'):
            filename += '.html'
        filepath = os.path.join(self.output_dir, filename)

        # dirname is '' when output_dir is empty; makedirs('') would raise.
        parent_dir = os.path.dirname(filepath)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(str(soup))
        print(f"Saved: {filepath}")

    def is_navigation_element(self, element):
        """Return True when ``element`` or any ancestor looks like navigation.

        An element counts as navigation when its class attribute contains
        one of ``nav_classes`` as a substring, its id equals one of
        ``nav_ids``, its ``role`` is navigation/menu/menubar, or its tag is
        ``nav``, ``header`` or ``footer``.
        """
        if not element:
            return False

        nav_roles = {'navigation', 'menu', 'menubar'}
        nav_tags = {'nav', 'header', 'footer'}

        current = element
        while current:
            classes = current.get('class')
            if classes:
                # The attribute may be a list of class names or one string.
                class_text = classes if isinstance(classes, str) else ' '.join(classes)
                class_text = class_text.lower()
                if any(nav_class in class_text for nav_class in self.nav_classes):
                    return True

            element_id = current.get('id')
            if element_id and element_id.lower() in self.nav_ids:
                return True

            role = current.get('role')
            if role and role.lower() in nav_roles:
                return True

            if current.name in nav_tags:
                return True

            current = current.parent
        return False

    def extract_links(self, content, current_url):
        """Collect the same-domain HTTP(S) links found in a page.

        Links are taken from the first ``<main>``/``<article>`` element,
        else from an element with class or id ``content``, else from the
        whole document. Fragments are dropped so ``page`` and ``page#top``
        count as one URL.

        Returns:
            A set of absolute URLs.
        """
        soup = BeautifulSoup(content, 'html.parser')
        main_content = (
            soup.find(['main', 'article'])
            or soup.find(class_='content')
            or soup.find(id='content')
        )

        links = set()
        for a in (main_content or soup).find_all('a', href=True):
            if self.ignore_navigation and self.is_navigation_element(a):
                continue

            href = a['href'].strip()
            if not href or href.startswith('#'):
                continue

            try:
                parsed = urlparse(urljoin(current_url, href))
            except ValueError:
                # Malformed href (e.g. a broken IPv6 literal); skip it.
                continue

            if parsed.scheme in ('http', 'https') and parsed.netloc == self.domain:
                links.add(parsed._replace(fragment='').geturl())
        return links

    def download_site(self, max_pages=50, delay=1):
        """Crawl from ``base_url`` and save up to ``max_pages`` pages.

        Args:
            max_pages: Maximum number of pages to save.
            delay: Seconds to wait between requests; ``0`` disables the
                pause.
        """
        urls_to_process = {self.base_url}
        # Every URL requested in this run, including failures, so a broken
        # link is not fetched again each time another page points at it.
        attempted = set()
        pages_downloaded = 0

        while urls_to_process and pages_downloaded < max_pages:
            current_url = urls_to_process.pop()
            if current_url in attempted or current_url in self.downloaded_urls:
                continue
            attempted.add(current_url)

            print(f"Downloading: {current_url}")
            content = self.download_resource(current_url)
            if content:
                self.save_page(content, current_url)
                self.downloaded_urls.add(current_url)
                pages_downloaded += 1
                new_links = self.extract_links(content, current_url)
                urls_to_process.update(new_links - attempted - self.downloaded_urls)

            # Pause only when another request is going to follow.
            if delay > 0 and urls_to_process and pages_downloaded < max_pages:
                time.sleep(delay)

        print(f"\nDownload complete! Downloaded {pages_downloaded} pages to {self.output_dir}")
if __name__ == "__main__":
    # Start URL: the first command-line argument when one is supplied,
    # otherwise the built-in example site (replace with your own target).
    url = sys.argv[1] if len(sys.argv) > 1 else "https://www.fractalpress.com"

    # Navigation filtering is switched on here; pass ignore_navigation=False
    # to follow links found in menus, headers and footers as well.
    downloader = WebpageDownloader(url, output_dir='downloaded_pages', ignore_navigation=True)
    downloader.download_site(max_pages=5)