import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import os
def _find_main_content(soup):
    """Return the element most likely to hold the article body, or None.

    Tries a ``<div class="article-content">`` container first, then falls
    back to the ``<div>`` holding the most text. Both are heuristics and
    will usually need adjusting for a specific site.
    """
    container = soup.find('div', class_='article-content')
    if container is not None:
        return container
    # Crude fallback: the div with the most text. ``default=None`` keeps
    # max() from raising ValueError on pages that contain no <div> at all.
    return max(
        soup.find_all('div'),
        key=lambda tag: len(tag.get_text()),
        default=None,
    )


def extract_article(url, output_filename="extracted_article.html", timeout=30):
    """Extract the main article content and images from a webpage.

    The selected content is written to a small standalone HTML file. Image
    ``src`` attributes inside that content are rewritten to absolute URLs so
    they still resolve when the saved file is opened from disk.

    Errors are reported with ``print`` and never propagate to the caller.

    Args:
        url: The URL of the article or blog post.
        output_filename: The name of the HTML file to save the extracted
            content to.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled
            connection.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        soup = BeautifulSoup(response.content, 'html.parser')

        # --- Identify Main Content Area ---
        main_content = _find_main_content(soup)
        if main_content is None:
            print("Warning: Could not automatically identify the main content area. You may need to adjust the selection logic in the script.")
            # Use the whole body as a last resort; a document without a
            # <body> tag falls back to the parsed document itself.
            main_content = soup.body if soup.body is not None else soup

        # --- Extract Images and Update Source URLs ---
        for img in main_content.find_all('img'):
            src = img.get('src')
            if src:
                # urljoin leaves absolute URLs untouched and resolves
                # relative ones against the page URL.
                img['src'] = urljoin(url, src)

        # --- Create Stripped-Down HTML ---
        # Declare the charset explicitly: the file is written as UTF-8 and a
        # browser opening it locally has no HTTP header to tell it so.
        html_content = (
            "<!DOCTYPE html>\n"
            "<html>\n"
            "<head>\n"
            '<meta charset="utf-8">\n'
            "<title>Extracted Article</title>\n"
            "</head>\n"
            "<body>\n"
            f"{main_content.prettify()}\n"
            "</body>\n"
            "</html>\n"
        )

        # --- Save to File ---
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(html_content)
        print(f"Article extracted and saved to {output_filename}")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
    except Exception as e:
        # Top-level boundary: report parsing or file errors instead of raising,
        # as callers of this function expect it never to raise.
        print(f"An error occurred: {e}")