#!/usr/bin/env bash

#
# Extract URLs from an HTML document or documents.
#
# Usage: pass zero or more HTML file names as arguments; with no arguments
# the document is read from standard input. The value of every href
# attribute found on an <a> element is written to standard output, one per
# line, sorted and with duplicates removed.
#
# Requires: pup (command-line HTML parser) in PATH.
#
# Exit status: non-zero if pup is missing or if any stage of the pipeline
# fails (for example, an unreadable input file).
#
# Author: Tom Ryder
# Copyright: 2016
# License: Public domain
#

# Report a failure in any stage of the pipeline, not just the last one, so
# that an unreadable input file or a pup error is not masked by sort
set -o pipefail

# Set a sensible locale so that sort(1) doesn't act dumbly
LANG=C.UTF-8
export LANG

# Check we have the programs we need; hash prints its own error message
hash pup || exit

# Emit the content of the args, or stdin. cat(1) is deliberate here: there
# may be several files to concatenate, so a plain redirection won't do.
# shellcheck disable=SC2002
cat -- "${@:-/dev/stdin}" |

  # Pipe it through a pup filter to get all the values of the a href elements
  pup 'a attr{href}' |

  # Sort it uniquely. LC_ALL is forced for sort alone because LANG is
  # overridden by any LC_ALL or LC_COLLATE inherited from the caller; plain
  # byte order also guarantees that only identical lines are merged.
  LC_ALL=C sort -u