#!/afs/isis/pkg/isis/bin/perl
# Last modified:  Time-stamp: <2003-12-09 13:37:10 haines>
#
# Abstract: Get NOS data for each station listed in nos_locs.txt
#   from an http call dependent on station ID.  Screen scrape the data
#   from the returned html and store it in one temporary file per station.
#
# Usage: % get_noaa_nos_by_station.pl [-d]
#   -d (also: d, debug, DEBUG, -debug, --debug)  print progress to STDOUT
#
# Author: Sara Haines (2003-09)

# Processing:
# (0) read in station list
# For each station
#   (1) Capture station data from http
#   (2) Screen scrape data from html and store to temporary file
#   (?) further limit it by glob (?)

use strict;
use warnings;

# add libraries needed for this function
use POSIX qw(strftime);
use LWP::Simple;

my $delimiter     = "\n";
my $station_file  = "file:/opt/local/seacoos/bin/nos_locs.txt";
my $temp_data_dir = "/seacoos/data/nos/raw_data/temp";

# If debugging is requested, print messages to STDOUT.
# The pattern is an anchored alternation: a character class such as
# [debug|DEBUG|d] would match ANY argument containing one of those letters.
my $debug_flag = qr/^-{0,2}(?:debug|d)$/i;
my $debug      = 0;
if (grep { /$debug_flag/ } @ARGV) {
    $debug = 1;
    # remove it from the list of input args
    @ARGV = grep { !/$debug_flag/ } @ARGV;
}

my $now = strftime("%Y:%m:%d %H:%M:%S", gmtime);
if ($debug) { print "\n==== Starting: $now UTC ==== Perl Version: $]\n"; }
my $starttime = time;

# (0) read in station list
# LWP::Simple::get returns undef on failure; without the list there is
# nothing to do, so stop with a clear message.
my $doc = get($station_file);
die "Unable to read station list $station_file\n" unless defined $doc;

# Extract station_ids: the first 7 characters of every non-blank line.
# (Splitting on the newline delimiter drops the newlines themselves.)
my @station_list;
foreach my $line (split /$delimiter/, $doc) {
    next unless $line =~ /\S/;    # a blank line would yield an empty station id
    push @station_list, substr($line, 0, 7);
}

my $starttime2 = time;
foreach my $station_id (@station_list) {
    if ($debug) { print "$station_id\n"; }

    my $temp_data_file = "$temp_data_dir/$station_id.txt";
    my $nos_url        = "http://tidesonline.nos.noaa.gov/data_read.shtml?station_info=$station_id";

    # (1) Get html document
    my $html = get($nos_url);
    unless (defined $html) {
        # Keep the original outcome for a failed fetch (an empty data file)
        # but report it instead of failing silently.
        warn "No response from $nos_url; writing empty data file\n";
        $html = '';
    }

    # (2) Screen scrape http for data lines
    # match everything between <pre> and </pre> tag to scrape data
    #   m{}s = match even if it includes newlines (test string as single line)
    #   m{}g = do match more than once until end of stream (match globally)
    #   m{}i = case insensitive match
    # NOTE(review): the tag names were lost from the copy of this file under
    # review; <pre>...</pre> is a reconstruction - confirm against the page.
    my @all_data = $html =~ m{<pre>(.*?)</pre>}sgi;

    # Save the scraped data, one captured section per line group.
    open(my $outfile, '>', $temp_data_file)
        or die "Unable to write temporary data file $temp_data_file: $!\n";
    print {$outfile} join("\n", @all_data);
    # Buffered write errors only surface at close, so check it.
    close($outfile)
        or die "Unable to close temporary data file $temp_data_file: $!\n";
}

my $subtime = time - $starttime2;
if ($debug) { print(" ... archive time = $subtime (seconds)\n"); }

my $cummtime = time - $starttime;
if ($debug) { print("Total script time = $cummtime (seconds)\n"); }