#!/usr/local/bin/perl
# Last modified:  Time-stamp: <2003-09-25 11:45:33 haines>
#
# Abstract:  get listing of files whose names match a pattern (e.g. "latest.nc") from an http index listing
#
# Usage:  % get_latest_listing.pl [-d]
#
# Author: Sara Haines (2003-09) 

# Processing:
#    (1) Screen scrape http index listing sent by server for all HREF data
#    (2) further limit it by regexp match against the requested pattern

# If debugging is requested on the command line, turn on messages to STDOUT.
# Accepted spellings (case-insensitive): d, debug, -d, -debug, --d, --debug.
# The whole argument must match, so an unrelated argument that merely contains
# one of those letters does not switch debugging on.
# $debug is deliberately left untouched when no flag is given.
if (grep { /\A-{0,2}(?:d|debug)\z/i } @ARGV) {
  $debug = 1;
}

# add libraries needed for this function
use POSIX qw(strftime);

# Record the start time (UTC); announce it only when debugging is on.
$now = strftime("%Y:%m:%d %H:%M:%S", gmtime());
print "\n==== Starting: $now UTC ==== Perl Version: $]\n" if $debug;

# Index URL and file-name pattern for this run.
# NOTE(review): $this_dir_url and $this_common are not assigned anywhere in
# this file -- presumably they are set by whatever includes or generates this
# script; confirm.
#
# Example patterns that have been used (prefix wild card like '.*'):
#   "latest.nc|OVW-QS-NRT-SECOOS102-.*.nc|Tot_PWSS.*"
#   "status_.*.csv"        (for gatech, > dec2004 no permission on file)
#   "status_20041101.*.csv"
#   "status_20041101.*.csv|.*.dat"
($dir_url, $common) = ($this_dir_url, $this_common);

# Collect the matching HREF entries from the index page.
@latest = get_index_match($dir_url, $common);

sub get_index_match {
# Purpose:
#    Fetch an HTTP index page and return the HREF values whose ending
#    matches a caller-supplied regular expression.
#
# Parameters:
#    $path    - URL of the index page; handed unchanged to LWP::Simple::get
#    $pattern - regular expression source; may contain '|' alternatives.
#               It is interpolated unescaped, so it is treated as a regexp,
#               not as literal text.
#
# Returns:
#    The matching HREF values in document order.  Empty list when the page
#    cannot be fetched or nothing matches.
#
# Processing:
#    (0) Get html document--no checking done to make sure this is an index.
#        A failed fetch yields an empty result.
#    (1) Screen scrape http index listing sent by server for all HREF data
#    (2) further limit it by regexp match to pattern desired by user

    # add libraries needed for this function
    # (a 'use' is processed at compile time, even when written inside a sub)
    use LWP::Simple;

    # passed parameters
    my ($path, $pattern) = @_;

    # escape any regexp chars given by user
    # (so $pattern can't be constructed regexp)
    # $pattern =~ s/([\*\.\<\>\{\}\[\]\^\$\|\+\?\\\/])/\\$1/g;

    # NOTE(review): $doc, @all_href and @matched are package globals here, as
    # they were before; they are kept that way in case an including script
    # reads them after the call -- confirm, then convert to 'my'.

    # (0) Get html document; get() gives undef when the fetch fails
    $doc = get($path);
    if (!defined $doc) {
        @all_href = ();
        @matched  = ();
        return @matched;
    }

    # (1) Screen scrape http for href values.
    # Each match fills exactly one of the three captures; the other two come
    # back undef and are dropped by the grep.  Only the attribute value is
    # captured, so attributes that follow href inside the same tag are not
    # swallowed, and '|' is no longer stripped as if it were a quote.
    @all_href = grep { defined } $doc =~ m{
        href \s* = \s*
        (?: " \s* ([^"<>]*?) \s* "      # double-quoted value
          | ' \s* ([^'<>]*?) \s* '      # single-quoted value
          | ([^\s"'<>]+)                # bare (unquoted) value
        )
    }gix;

    # (2) further limit with users pattern.
    # The (?:...) group makes the end anchor apply to every '|' alternative;
    # without it only the last alternative is anchored.
    my $wanted = qr/(?:$pattern)$/;
    @matched = grep { $_ =~ $wanted } @all_href;

    return @matched;

} # sub get_index_match
# ----------------------------------------------------------------