#!/usr/bin/env python3
import os
import re

import requests
from urllib.parse import urljoin, urlparse

# Target NAID
NAID = "580103959"
BASE_URL = f"https://catalog.archives.gov/id/{NAID}"
OUT_DIR = "nsa_uap_pdfs"

os.makedirs(OUT_DIR, exist_ok=True)

session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
})

print(f"Fetching {BASE_URL}")
resp = session.get(BASE_URL, timeout=10)
resp.raise_for_status()
html = resp.text

# Extract all PDF links from the page by matching href attributes
# whose values point to PDFs.
pdf_links = re.findall(r'href=(["\'])([^"\']*\.pdf[^"\']*)\1', html, re.IGNORECASE)

pdf_urls = []
seen = set()
for _, url in pdf_links:
    # Normalize protocol-relative and page-relative URLs against the catalog page.
    if url.startswith("//"):
        url = "https:" + url
    elif not url.startswith("http"):
        url = urljoin(BASE_URL, url)
    if url not in seen:
        seen.add(url)
        pdf_urls.append(url)

print(f"Found {len(pdf_urls)} PDF URLs on the catalog page")

if not pdf_urls:
    print("No PDFs found on the main page. Looking for digital objects...")
    # Fall back to JSON-LD structured data embedded in <script> tags.
    json_ld = re.findall(
        r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>',
        html, re.DOTALL | re.IGNORECASE)
    if json_ld:
        print(f"Found {len(json_ld)} JSON-LD blocks, but extraction needs refinement")

# Download PDFs
for url in pdf_urls:
    fname = urlparse(url).path.split("/")[-1]
    if not fname.endswith(".pdf"):
        fname += ".pdf"
    out_path = os.path.join(OUT_DIR, fname)
    if os.path.exists(out_path):
        print(f"Skipping existing {fname}")
        continue
    print(f"Downloading {fname}...")
    try:
        resp = session.get(url, stream=True, timeout=15)
        resp.raise_for_status()
        # Stream the response to disk in chunks so large PDFs are not
        # buffered entirely in memory.
        with open(out_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
        print(f"  ✓ Saved {fname}")
    except Exception as e:
        print(f"  ✗ Error downloading {url}: {e}")

print(f"\nDownload complete. Files saved to {OUT_DIR}")
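
# --- Sketch: refining the JSON-LD fallback ---
# The fallback above only counts JSON-LD blocks; it does not yet pull file URLs
# out of them. Below is a minimal, hedged sketch of one way that could work.
# It assumes only that digital-object PDF URLs appear somewhere in the JSON-LD
# as string values ending in ".pdf" (the catalog's actual schema and key names
# are not confirmed here, so the walker ignores key names entirely).
import json

def pdf_urls_from_json_ld(blocks):
    """Walk each JSON-LD block and collect any string value ending in .pdf."""
    found = []

    def walk(node):
        # Recurse through nested dicts and lists; collect matching strings.
        if isinstance(node, dict):
            for value in node.values():
                walk(value)
        elif isinstance(node, list):
            for item in node:
                walk(item)
        elif isinstance(node, str) and node.lower().endswith(".pdf"):
            found.append(node)

    for block in blocks:
        try:
            walk(json.loads(block))
        except json.JSONDecodeError:
            continue  # skip malformed blocks rather than abort
    return found

# Example use (inside the `if json_ld:` branch above, where json_ld is bound):
#     pdf_urls.extend(pdf_urls_from_json_ld(json_ld))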