#!/usr/local/bin/perl
# Last modified:  Time-stamp: <2003-09-25 11:45:33 haines>
#
# Abstract: get listing of files whose names match a pattern (for example
#           "latest.nc") from an http index page.
#
# Usage:    % get_latest_listing.pl [-d]
#
# Inputs (package globals read here but never assigned in this file):
#   $this_dir_url   URL of the index page to scrape
#   $this_common    regexp selecting the wanted HREF values
#   NOTE(review): presumably set by whatever loads this file -- confirm.
#
# Output (package global):
#   @latest         HREF values from the index that matched $this_common
#
# Author: Sara Haines (2003-09)
#
# Processing:
#   (1) Screen scrape http index listing sent by server for all HREF data
#   (2) further limit it by regexp match

use strict;
use warnings;

# add libraries needed for this script
use POSIX qw(strftime);
use LWP::Simple;

# Script-level state stays in package globals so that code outside this
# file can keep setting the inputs and reading the results.
our ($this_dir_url, $this_common);
our ($debug, $now, $dir_url, $common, @latest);

# if debugging requested print messages to STDOUT
# Accepts d / -d / --d / debug / -debug / --debug in any letter case.
# $debug is left untouched when no such argument is present.
if (grep { /\A-{0,2}(?:d|debug)\z/i } @ARGV) {
    $debug = 1;
}

# timestamp (UTC) shown in the debug banner
$now = strftime("%Y:%m:%d %H:%M:%S", gmtime);
if ($debug) {
    print "\n==== Starting: $now UTC ==== Perl Version: $]\n";
}

$dir_url = $this_dir_url;

# Example patterns (prefix wild card like '.*'):
#$common = "latest.nc|OVW-QS-NRT-SECOOS102-.*.nc|Tot_PWSS.*";
#$common = "status_.*.csv"; #for gatech, > dec2004 no permission on file
#$common = "status_20041101.*.csv";
#$common = "status_20041101.*.csv|.*.dat";
$common = $this_common;

@latest = get_index_match($dir_url, $common);

# get_index_match($path, $pattern)
#
# Fetch the html document at $path and return the HREF values in it that
# match $pattern.
#
#   $path     URL of the index page
#   $pattern  regexp, used as /$pattern$/ -- it is NOT escaped, so the
#             caller may pass a constructed regexp.  Because of operator
#             precedence the trailing '$' anchors only the last
#             alternative of a pattern such as "a|b".
#
# Returns the list of matching HREF values; an empty list when $path is
# undefined or the document could not be fetched.
#
# Processing:
#   (0) Get html document--no checking done to make sure this is an index
#       (future upgrade?)
#   (1) Screen scrape http index listing sent by server for all HREF data
#   (2) further limit it by regexp match to pattern desired by user
sub get_index_match {
    # passed parameters
    my ($path, $pattern) = @_;

    my @matched;

    # an undefined pattern selects every HREF, as /$/ matches any string
    $pattern = '' unless defined $pattern;

    # escape any regexp chars given by user
    # (so $pattern can't be constructed regexp)
    # $pattern =~ s/([\*\.\<\>\{\}\[\]\^\$\|\+\?\\\/])/\\$1/g;

    # (0) Get html document; LWP::Simple::get returns undef on failure
    my $doc = defined $path ? get($path) : undef;
    if (!defined $doc) {
        if ($debug) {
            my $shown = defined $path ? $path : '(undefined url)';
            print "get_index_match: could not fetch $shown\n";
        }
        return @matched;
    }

    # some other possible html scrapes
    # @all_href = $doc =~ m{href=(.*?)>}gi;
    # @all_href = $doc =~ m{href\s*=\s*(.*?)>}gi;

    # (1) Screen scrape http for href lines, dropping any whitespace and
    #     quotes (double or single) that surround the value
    my @all_href = $doc =~ m{href\s*=[\s"']*(.*?)[\s"']*>}gi;

    # (2) further limit with users pattern
    my $wanted = qr/$pattern$/;
    @matched = grep { $_ =~ $wanted } @all_href;

    return @matched;
} # sub get_index_match

# ----------------------------------------------------------------