Web scraping is the automated extraction of data from websites. In cybersecurity, it is used for OSINT (Open Source Intelligence) — gathering publicly available information about a target before an attack or security assessment.
Scraping publicly accessible information is generally lawful, but it remains subject to a site's terms of service, robots.txt, and local law — always confirm you have authorization. Done responsibly, it is one of the most valuable skills in bug bounty and penetration testing reconnaissance phases.
pip install requests beautifulsoup4 lxml
import requests

# Issue a plain GET request and inspect the pieces of the response object.
url = "https://httpbin.org/get"
response = requests.get(url)

print(response.status_code)  # numeric status, e.g. 200 on success
print(response.headers)      # response HTTP headers (dict-like)
print(response.text)         # raw body as text
print(response.json())       # body parsed as JSON, when the server returns JSON
# Send browser-like headers so the request is less likely to be rejected
# by naive bot detection. Uses `url` and `requests` from the snippet above.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://google.com",
}

response = requests.get(url, headers=headers)
print(response.status_code)
from bs4 import BeautifulSoup
import requests

# Fetch a page and parse it with the fast lxml backend.
url = "https://example.com"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')

# The document title.
print(soup.title.text)

# Walk every anchor tag and print its visible text plus target URL.
links = soup.find_all('a')
for link in links:
    href = link.get('href')
    text = link.text.strip()
    if href:
        print(f"{text}: {href}")

# Elements selected by CSS class.
divs = soup.find_all('div', class_='content')

# A single element selected by its id attribute.
header = soup.find(id='main-header')

# Source URL of every image on the page.
images = soup.find_all('img')
for img in images:
    print(img.get('src'))
import requests
from bs4 import BeautifulSoup
import re
def find_emails(url):
"""Find all email addresses on a webpage."""
try:
headers = {"User-Agent": "Mozilla/5.0"}
response = requests.get(url, headers=headers, timeout=10)
# Find emails in HTML using regex
email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+.[a-zA-Z]{2,}'
emails = re.findall(email_pattern, response.text)
# Remove duplicates
unique_emails = list(set(emails))
return unique_emails
except Exception as e:
print(f"Error: {e}")
return []
# Demo: harvest addresses from a contact page and report each hit.
emails = find_emails("https://example.com/contact")
for email in emails:
    print(f"Found: {email}")
import requests

# A Session object persists cookies between requests, so the cookie set
# by a successful login is automatically sent on later calls.
session = requests.Session()

# Credentials posted as form data to the login endpoint.
login_data = {
    "username": "your_username",
    "password": "your_password",
}
session.post("https://example.com/login", data=login_data)

# Authenticated request — the session cookie rides along automatically.
profile = session.get("https://example.com/profile")
print(profile.text)
import time
import random

urls = ["https://example.com/page1", "https://example.com/page2"]

for url in urls:
    response = requests.get(url)
    print(f"Scraped: {url}")
    # Be polite: pause a randomized 1-3 seconds between requests so the
    # scraper doesn't overload the target server (and looks less robotic).
    time.sleep(random.uniform(1, 3))
Subscribe to ONLY4YOU and get hands-on access to 40+ premium courses — Ethical Hacking, Kali Linux, Metasploit, Network Hacking, Bug Bounty & more!