#!/usr/bin/env ruby
# -*- ruby -*-
#
# Downloads all the links on a page given the passed-in URL and
# optional regular expressions.
#
# E.g.
#
#   download_all http://spudtrooper.org               ; Download all
#   download_all http://spudtrooper.org *.js          ; Download js files
#   download_all http://spudtrooper.org /*.js/        ; Same: avoid files in pwd
#   download_all http://spudtrooper.org/*.js          ; Same
#   download_all http://spudtrooper.org *.js *.html   ; Download js+html files
#

require 'fileutils'
require 'uri'
require 'net/http'
require 'open-uri'

# Collects page URLs and glob-like patterns, scans each page for href/src
# attributes, and saves the matching links into an output directory.
class DownloadAll
  attr_writer :overwrite, :outdir

  # Matches href="..." / src='...' attributes and captures the value.
  # The attribute names are a real alternation: a character class such as
  # [href|src] would also match class=, title=, type=, and so on.
  LINK_RE = /\b(?:href|src)\s*=\s*['"]?([^"']+)['"]/i.freeze

  # File name used when a URL's path has no final component (e.g. ends in '/').
  DEFAULT_FILENAME = 'index.html'

  def initialize
    @regexps = []
    @urls = []
    @urls2regexps = {}
    @overwrite = false
    @outdir = '.'
  end

  # Void -> Void: Scan every registered URL and download its matching links.
  # URLs given without a scheme are fetched over http://.
  def download_all
    width = log10(@urls.size)
    @urls.each_with_index do |u, i|
      url = u =~ %r{://} ? u : "http://#{u}"
      STDERR.printf "[%#{width}d/%#{width}d] %s\n", i + 1, @urls.size, url
      # Per-URL patterns are keyed by the URL exactly as it was registered.
      download download_uris(url, compile_regexps(regexps_for_url(u)))
    end
  end

  # String -> Void: Add a url to use
  def add_url(url)
    @urls << url
    @urls.uniq!
  end

  # String (String) -> Void: Add a regular expression to use
  # If 'url' provided we add a regexp specific to only that url
  # Otherwise, we add it for all URLs
  def add_regexp(regexp, url = nil)
    if url
      lst = @urls2regexps[url] || []
      lst << regexp
      @urls2regexps[url] = lst.uniq
    else
      @regexps << regexp
      @regexps.uniq!
    end
  end

  # [String] -> Integer[exit code]
  #
  # Parses options, registers the remaining arguments as URLs/patterns and
  # runs the download. Returns 0 on success (or after printing help on
  # request) and 1 on a usage error.
  #
  # NOTE(review): the original body of this method was lost from the file
  # (only "args = [] / i = 0 / while i" survived). This is a reconstruction
  # from the usage examples in the file header and the text of print_help;
  # confirm it against the intended behavior.
  def real_main(argv)
    args = []
    i = 0
    while i < argv.size
      arg = argv[i]
      case arg
      when '-h', '--help'
        print_help
        return 0
      when '-f', '--force'
        @overwrite = true
      when '-d', '--outdir'
        i += 1
        if i >= argv.size
          STDERR.puts "Missing directory after #{arg}"
          print_help
          return 1
        end
        @outdir = argv[i]
      else
        args << arg
      end
      i += 1
    end

    args.each { |a| add_argument(a) }
    if @urls.empty?
      print_help
      return 1
    end

    download_all
    0
  end

  # [String] -> Integer: Script entry point.
  # Runs real_main and terminates the process with its status when that
  # status is non-zero; a zero status is simply returned.
  #
  # NOTE(review): 'main' is called at the bottom of the file but its
  # definition was lost; this is a reconstruction.
  def main(argv)
    status = real_main(argv)
    exit status unless status.zero?
    status
  end

  # String -> [String]: List of REs for a specific URL
  #
  # Since we can have a different list of REs for each url, this
  # returns the global REs followed by the REs registered for 'url'.
  def regexps_for_url(url)
    (@regexps + (@urls2regexps[url] || [])).uniq
  end

  # String [Regexp] -> [URI]: URIs to download contained in 'url' for 'regexps'
  #
  # If regexps is empty, we download all links, otherwise we download
  # the links whose HREF/SRC attribute matches at least one of 'regexps'.
  def download_uris(url, regexps)
    # URI.open rather than Kernel#open: the latter no longer fetches URLs on
    # Ruby 3.0+. The block form also closes the underlying IO.
    str = URI.open(url, &:read)
    return [] unless str

    extract_uris(url, str, regexps)
  end

  # Real -> Integer: Field width used for the "[i/n]" progress counters.
  def log10(n)
    Math.log10(n + 1).ceil
  end

  # [URI] -> Void: Download every URI into the output directory,
  # creating the directory (and any missing parents) first.
  def download(uris)
    return if uris.empty?

    FileUtils.mkdir_p(@outdir)
    width = log10(uris.size)
    uris.each_with_index do |uri, i|
      download_uri(uri, @outdir, width, i, uris.size)
    end
  end

  # URI String Integer Integer Integer -> Void
  #
  # Downloads one URI into 'outdir'. 'n' is the counter field width, 'i' the
  # zero-based index of this URI and 'size' the total count; they are only
  # used for the progress line. Prompts before replacing an existing file
  # unless overwriting is enabled. Redirects are not followed.
  def download_uri(uri, outdir, n, i, size)
    # Links such as mailto: or javascript: cannot be fetched over HTTP.
    unless uri.is_a?(URI::HTTP) && uri.host
      STDERR.puts "Skipping non-HTTP link: #{uri}"
      return
    end

    name = uri.path.sub(%r{.*/}, '')
    name = DEFAULT_FILENAME if name.empty?
    file = File.join(outdir, name)
    filestr = file.gsub(%r{^\.+/}, '')

    # Prompt if file exists
    return if !@overwrite && File.exist?(file) && !confirm_overwrite?(filestr)

    STDERR.printf "[%#{n}d/%#{n}d] %s -> %s\n", i + 1, size, uri, filestr
    # Honor the port and scheme of the link, and keep its query string.
    resp = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https') do |http|
      http.get(uri.request_uri)
    end
    unless resp.is_a?(Net::HTTPSuccess)
      # Do not save an error page under the name of the requested file.
      STDERR.puts "Failed to download #{uri}: #{resp.code} #{resp.message}"
      return
    end

    File.open(file, 'wb') { |out| out.write(resp.body) }
  end

  # Prints help
  def print_help
    STDERR.puts "Usage #{File.basename($PROGRAM_NAME)} [options] url | regexp"
    STDERR.puts 'where options include'
    STDERR.puts '  -h || --help            print this message'
    STDERR.puts '  -f || --force           overwrite existing files'
    STDERR.puts '  -d || --outdir <dir>    write output files to <dir>'
    STDERR.puts 'and where at least one url is required.'
  end

  private

  # String -> Void: Register one non-option argument.
  #
  #   /.../          a pattern for all URLs
  #   url/pat*tern   a URL plus a pattern that applies only to that URL
  #   pat*tern       a pattern for all URLs
  #   anything else  a URL
  #
  # NOTE(review): reconstructed from the usage examples in the file header.
  def add_argument(arg)
    if arg =~ %r{\A/.+/\z}
      add_regexp(arg)
    elsif (star = arg.index('*'))
      # Split at the last '/' before the first '*'.
      slash = arg.rindex('/', star)
      prefix = slash ? arg[0...slash] : ''
      if prefix.empty? || prefix.end_with?(':', ':/')
        # No usable URL part in front of the pattern.
        add_regexp(arg)
      else
        add_url(prefix)
        add_regexp(arg[(slash + 1)..-1], prefix)
      end
    else
      add_url(arg)
    end
  end

  # [String] -> [Regexp]: Turn glob-like patterns into Regexps.
  # '.' is taken literally, '*' matches anything, and one pair of
  # surrounding slashes is dropped. Other characters are used as-is, so
  # regexp syntax in a pattern stays active.
  def compile_regexps(patterns)
    patterns.map do |pattern|
      re = pattern.gsub('.') { '\\.' }.gsub('*') { '.*' }
      re = re.sub(%r{\A/}, '').sub(%r{/\z}, '') if re =~ %r{\A/.*/\z}
      /.*#{re}.*/
    end
  end

  # String String [Regexp] -> [URI]: Links found in 'html', resolved
  # against 'url', filtered by 'regexps' (no filter when empty), without
  # duplicates and in order of first appearance.
  def extract_uris(url, html, regexps)
    uris = []
    # scrub: pages with invalid byte sequences would otherwise raise here.
    html.scrub.split("\n").each do |line|
      line.scan(LINK_RE) do |match|
        href = match.first
        next unless regexps.empty? || regexps.any? { |re| re.match(href) }

        begin
          uris << URI.join(url, href)
        rescue URI::Error => e
          # One malformed link should not abort the whole page.
          STDERR.puts "Skipping unparsable link #{href.inspect}: #{e.message}"
        end
      end
    end
    uris.uniq
  end

  # String -> Boolean: Ask whether 'filestr' may be overwritten.
  # 'a' also enables overwriting for the rest of the run. An empty answer
  # selects the advertised default (Y); end of input counts as "no" so
  # that nothing is overwritten without an explicit answer.
  def confirm_overwrite?(filestr)
    loop do
      print "Overwrite #{filestr}? [Y/n/A] "
      STDOUT.flush
      line = STDIN.gets
      return false if line.nil?

      ans = line.downcase.strip
      case ans
      when 'y', ''
        return true
      when 'n'
        return false
      when 'a'
        @overwrite = true
        return true
      else
        STDERR.puts "Invalid answer: #{ans}"
      end
    end
  end
end

DownloadAll.new.main ARGV