$artist - $title.mp3

#use strict;#if-debug sub version_cpp { return '2.0.0.0072'; } =head1 HEAD Copyright 1997-2004 by Zoltan Milosevic, All Rights Reserved See http://www.xav.com/scripts/search/ for more information. If you edit the source code, you'll find it useful to restore the function comments and #&Assert checks: cd "search/searchmods/powerusr/" hacksubs.pl build_map hacksubs.pl restore_comments hacksubs.pl assert_on common_parse_page.pl contains all the functions used to index local documents. This library is always loaded during admin requests. This library needs to be available during public search requests, because if there is a runtime realm involved, then searching that realm will require the local indexing functions. Since the public search code needs to be kept as slim as possible, and because runtime realms are rare, the search code will optimize away from loading this library if no runtime realms are detected. =cut sub test_handler_syntax { my ($b_verbose, $folder, $name, %utilities) = @_; my $err = ''; Err: { my $hpath = &he($folder); print qq!

The $name setting is set to '$hpath'.

\n! if ($b_verbose); if ($hpath =~ m!\s!) { $err = "folder '$folder' contains whitespace. This is not supported"; next Err; } if ($hpath eq '') { $err = "folder string is empty. $name is not integrated with this script"; next Err; } # file existence test: print qq!

The -e file existence test returns ! . ((-e $folder) ? 'true' : 'FALSE') . qq! on this path.

\n! if ($b_verbose); print qq!

The -d is-directory test returns ! . ((-d $folder) ? 'true' : 'FALSE') . qq! on this path.

\n! if ($b_verbose); if ($folder !~ m!(\\|/)$!) { print qq!

Warning: the path "$hpath" does not end in a trailing slash of "\\" or "/". This setting must have a proper trailing slash.

\n! if ($b_verbose); } # slash convention: my ($good, $bad, $extension) = ("/", "\\", ''); if ($^O =~ m!mswin!i) { print qq!

Perl's \$^O operating system variable returns $^O. This pattern matches to m/mswin/. Assuming Microsoft Windows. Assuming backslash convention "\\" as folder separator.

\n! if ($b_verbose); ($good, $bad, $extension) = ("\\", "/", '.exe'); } else { print qq!

Perl's \$^O operating system variable returns $^O. This does not pattern match to m/mswin/. Assuming not Microsoft Windows. Assuming forward slash convention "/" as folder separator.

\n! if ($b_verbose); } my $qmbad = quotemeta($bad); my $qmgood = quotemeta($good); if ($folder =~ m!$qmbad!) { print qq!

Warning: path "$hpath" contains slash characters "$bad" which don't appear native to this platform. The native slash convention must be used because this path string will be used to shell out to the operating system. The operating system will not be as tolerant as you or I in equating "/" and "\\" as folder separators.

\n! if ($b_verbose); } else { # we know must be either bad or good due to trailing-slash-check above print qq!

Path "$hpath" contains native slash convention; it matches "$good" and does not match "$bad". Great job\!

\n! if ($b_verbose); } my @files = (); foreach (sort keys %utilities) { push(@files, $_ . $extension); } print "

Performing discovery tests on individual executable files...

\n" if ($b_verbose); foreach (@files) { my $full = $folder . $_; my $hfull = &he($full); print "

$hfull

\n" if ($b_verbose); if (-e $full) { print "

-e file existence: TRUE

\n" if ($b_verbose); if (-X $full) { print "

-X is-executable: TRUE

\n" if ($b_verbose); } else { print "

-X is-executable: FALSE

\n" if ($b_verbose); } } else { print "

-e file existence: FALSE

\n" if ($b_verbose); } } print <<"EOM" if ($b_verbose);

Making system calls to test inter-operability.

This script will shell out to the commands, without any arguments. The utilities should return their usage syntax. This text will be validated to confirm that it references a known string.

EOM foreach (sort keys %utilities) { my ($stdout, $stderr) = &get_command_out( qq!"$folder$_"!, $b_verbose ); if (($stdout !~ m!$utilities{$_}!is) and ($stderr !~ m!$utilities{$_}!is)) { $err = "command output did not match expected pattern '$utilities{$_}'. Utilities may not be functioning properly, or the system calls and I/O redirection from Perl may not be functioning properly"; next Err; } print "

Success: verified that command output matched '$utilities{$_}'.

\n" if ($b_verbose); } last Err; } return $err; } sub handlers_init { my ($b_load_all, $b_verbose) = @_; $::private{'handlers'} = []; my $handler; $handler = { 'enabled' => 1, 'name' => 'MP3-Internal', 'help' => qq!$::const{'help_file'}1183.html!, 'read_last_bytes' => 128, # special case for supporting MP3 ID3v1 metadata # affects only the passing of $binary_slice, not $alt_file_path 'extension_pattern' => '^mp3$', 'content_type_pattern' => 'audio/(mpeg-3|mpeg)', 'converter' => sub{ my ($binary_slice, $alt_file_path, $URL, $b_verbose) = @_; my $text = ''; my $err = ''; Err: { if (($binary_slice eq '') and ($alt_file_path)) { # load from file if (not -e $alt_file_path) { $err = "file '$alt_file_path' does not exist"; next Err; } my $fsize = -s $alt_file_path; if ($fsize < 128) { # impossible to have ID3v1 metadata w/o 128-byte minimum size $err = "unable to extract text from MP3; file size $fsize bytes is less than minimum required 128 bytes"; next Err; } # read in final 128 bytes unless (open(FILE, "<$alt_file_path")) { $err = "unable to read file '$alt_file_path' - $!"; next Err; } binmode(FILE); seek(FILE, -128, 2); read(FILE,$binary_slice,128); close(FILE); } $binary_slice = substr( $binary_slice, -128 ); my $len = length($binary_slice); if ($b_verbose) { my $hslice = &he($binary_slice); print "

TRACE: binary slice of $len bytes reads:

$hslice

\n"; } my ($tag, $title, $artist, $album, $year, $comment) = unpack('A3A30A30A30A4A30', $binary_slice); if ($tag ne 'TAG') { # oops - no ID3v1 data $err = "no text found within MP3 file (the last 128 bytes did not start with literal 'TAG' as expected)"; next Err; } $artist = "$2 $1" if ($artist =~ m!^(.*), (the)$!i); $text = qq!\n$artist - $title.mp3\n\n\n$album $year $title $artist $comment!; if ($b_verbose) { my $htext = &he($text); print "

TRACE: extracted ID3v1 metadata to following HTML string:

$htext

\n"; } } return ($err, $text); }, }; if (($handler->{'enabled'}) or ($b_load_all)) { print "

TRACE: binary-to-HTML handler $handler->{'name'} enabled.

\n" if ($b_verbose); push( @{ $::private{'handlers'} }, $handler ); } elsif ($b_verbose) { print "

TRACE: NOT LOADING binary-to-HTML handler $handler->{'name'} (pre-flight test failed)

\n"; } $handler = { 'enabled' => (($::private{'pdf utility folder'}) and (-e $::private{'pdf utility folder'})), 'name' => 'XPDF', 'help' => qq!$::const{'help_file'}1181.html!, 'read_last_bytes' => 0, 'extension_pattern' => '^pdf$', 'content_type_pattern' => 'application/pdf', 'test_syntax' => sub{ my ($b_verbose) = @_; return &test_handler_syntax( $b_verbose, $::private{'pdf utility folder'}, 'XPDF', 'pdfinfo' => 'Usage:\s+pdfinfo', 'pdftotext' => 'Usage:\s+pdftotext', ); }, 'converter' => sub{ my ($binary_slice, $alt_file_path, $URL, $b_verbose) = @_; my $text = ''; my $err = ''; Err: { my $b_delete_temp = 0; my $trustcharset = q!^[a-zA-Z0-9\_\-\.\:\ \/]+$!; #alphanumerics, hyphen,underscore,period,colon,forward-slash if ($alt_file_path) { if ($alt_file_path =~ m!$trustcharset!) { if ($alt_file_path =~ m! !) { # charset okay, but has embedded space - double-quote $alt_file_path = qq!"$alt_file_path"!; } } else { ($err, $binary_slice) = &ReadFileL( $alt_file_path ); next Err if ($err); $alt_file_path = ''; } } if ($binary_slice) { # create a temp file with a random name $alt_file_path = 'temp' . $$ . rand() . '.pdf'; $err = &WriteFile($alt_file_path, $binary_slice); next Err if ($err); $b_delete_temp = 1; } my ($headtext, $bodytext, $stderr); ($headtext, $stderr) = &get_command_out( qq!"$::private{'pdf utility folder'}pdfinfo" $alt_file_path!, $b_verbose ); ($bodytext, $stderr) = &get_command_out( qq!"$::private{'pdf utility folder'}pdftotext" -raw $alt_file_path -!, $b_verbose ); if ($b_delete_temp) { unless (unlink($alt_file_path)) { $err = &pstr(54, $alt_file_path, $!); next Err; } } my $headers = ''; foreach (split(m!\n!s, $headtext)) { next unless (m!^(.*?)\:\s*(.+?)$!); my ($name, $value) = (&Trim($1), &Trim($2)); next unless ($value); if (lc($name) eq 'title') { $headers .= "\t$value\n"; } else { $headers .= "\t\n"; } } $bodytext = &he($bodytext); $text = " $headers

$bodytext

"; last Err; } return ($err, $text); }, }; if (($handler->{'enabled'}) or ($b_load_all)) { print "

TRACE: binary-to-HTML handler $handler->{'name'} enabled.

\n" if ($b_verbose); push( @{ $::private{'handlers'} }, $handler ); } elsif ($b_verbose) { print "

TRACE: NOT LOADING binary-to-HTML handler $handler->{'name'} (pre-flight test failed)

\n"; } $handler = { 'enabled' => (($::private{'antiword utility folder'}) and (-e $::private{'antiword utility folder'})), 'name' => 'Antiword', 'help' => qq!$::const{'help_file'}1182.html!, 'read_last_bytes' => 0, 'extension_pattern' => '^doc$', 'content_type_pattern' => 'application/msword', 'test_syntax' => sub{ my ($b_verbose) = @_; return &test_handler_syntax( $b_verbose, $::private{'antiword utility folder'}, 'Antiword', 'antiword' => 'Usage:\s+antiword', ); }, 'converter' => sub{ my ($binary_slice, $alt_file_path, $URL, $b_verbose) = @_; my $text = ''; my $err = ''; Err: { my $b_delete_temp = 0; my $trustcharset = q!^[a-zA-Z0-9\_\-\.\:\ \/]+$!; #alphanumerics, hyphen,underscore,period,colon,forward-slash if ($alt_file_path) { if ($alt_file_path =~ m!$trustcharset!) { if ($alt_file_path =~ m! !) { # charset okay, but has embedded space - double-quote $alt_file_path = qq!"$alt_file_path"!; } } else { ($err, $binary_slice) = &ReadFileL( $alt_file_path ); next Err if ($err); $alt_file_path = ''; } } if ($binary_slice) { # create a temp file with a random name $alt_file_path = 'temp' . $$ . rand() . '.doc'; $err = &WriteFile($alt_file_path, $binary_slice); next Err if ($err); $b_delete_temp = 1; } $ENV{'HOME'} = '.'; my $stderr; ($text, $stderr) = &get_command_out( "$::private{'antiword utility folder'}antiword -t $alt_file_path", $b_verbose ); if ($b_delete_temp) { unless (unlink($alt_file_path)) { $err = &pstr(54, $alt_file_path, $!); next Err; } } last Err; } return ($err, $text); }, }; if (($handler->{'enabled'}) or ($b_load_all)) { print "

TRACE: binary-to-HTML handler $handler->{'name'} enabled.

\n" if ($b_verbose); push( @{ $::private{'handlers'} }, $handler ); } elsif ($b_verbose) { print "

TRACE: NOT LOADING binary-to-HTML handler $handler->{'name'} (pre-flight test failed)

\n"; } } sub handler_match { my ($URL, $content_type, $b_verbose) = @_; my $p_sub = undef(); my $read_last_bytes = 0; Err: { &handlers_init(0,$b_verbose) unless (exists($::private{'handlers'})); print "

TRACE: handler_match: URL:$URL Content-Type '$content_type'

\n" if ($b_verbose); my $type_identifier; my $match_against; if ($content_type) { $type_identifier = $content_type; $match_against = 'content_type_pattern'; } else { # match on extension $type_identifier = $URL; $type_identifier =~ s!\?.*$!!s; # strip query string $type_identifier =~ s!\#.*$!!s; # strip fragment identifier $type_identifier = ($type_identifier =~ m!\.(\w+)$!) ? $1 : 'null'; $match_against = 'extension_pattern'; } print "

TRACE: handler_match: comparing '$type_identifier' to '$match_against' property of each handler.

\n" if ($b_verbose); my $p_handler; foreach $p_handler (@{ $::private{'handlers'} }) { my $pattern = $p_handler->{$match_against}; unless ($type_identifier =~ m!$pattern!i) { print "

TRACE: handler_match: string '$type_identifier' did not match pattern $pattern.

\n" if ($b_verbose); next; } print "

TRACE: handler_match: string '$type_identifier' matched pattern $pattern. Activating handler.

\n" if ($b_verbose); $p_sub = $p_handler->{'converter'}; $read_last_bytes = $p_handler->{'read_last_bytes'}; last Err; } print "

TRACE: handler_match: no binary handlers matched; normal parsing rules will be used.

\n" if ($b_verbose); last Err; } return ($p_sub, $read_last_bytes); } sub load_custom_metadata { my ($url, $p_metadata) = @_; my $err = ''; Err: { last Err unless ($::Rules{'use dbm routines'}); my %custom = (); eval { if (dbmopen( %custom, 'custom_metadata', 0666 )) { if (defined($custom{$url})) { my $data = $custom{$url}; my $pair; foreach $pair (split(m! !s, $data)) { next unless ($pair =~ m!^(.+?)=(.*)$!s); $$p_metadata{$1} = &ud($2); } } dbmclose( %custom ); } }; last Err; } return $err; } sub get_command_out { my ($command, $b_verbose) = @_; my $stdout = ''; my $stderr = ''; block: { my $temp = "delete_me." . time() . rand(); unlink($temp) if (-e $temp); select(STDERR); $| = 1; select(STDOUT); my $b_restore_ok = 0; if (open(OLDERR, ">&STDERR")) { binmode(OLDERR); $b_restore_ok = 1; } else { print "

Warning: unable to save STDERR file handle.

\n"; } my $b_close = 0; if (open(STDERR, ">$temp")) { $b_close = 1; #ok } elsif ($b_verbose) { print "

Warning: unable to redirect STDERR to temp file '$temp' - $! - $^E.

\n"; } local $/ = undef(); print "

Status: launching command '$command' as child process...

\n" if ($b_verbose); $stdout = `$command`; if ($b_close) { close(STDERR); # changed 0071 my $err = ''; ($err, $stderr) = &ReadFileL($temp); unlink($temp) if (-e $temp); if ($b_restore_ok) { unless (open(STDERR, ">&OLDERR")) { print "

Warning: unable to restore STDERR file handle.

\n"; } } } if ($b_verbose) { my $len = length($stderr); if ($len) { print qq!

Status: the process returned the following $len bytes on STDERR:

! . &he($stderr) . qq!

\n!; } else { print qq!

Status: the process did not write to STDERR.

\n!; } $len = length($stdout); if ($len) { print qq!

Status: the process returned the following $len bytes on STDOUT:

! . &he($stdout) . qq!

\n!; } else { print qq!

Status: the process did not write to STDOUT.

\n!; } } } return ($stdout, $stderr); } sub parse_meta_header { my ($p_text, $name) = @_; my $value = ''; $name = quotemeta($name); #&Assert('SCALAR' eq ref($p_text)); #&Assert($name); #changed 0054 - allow meta="foo"content="bar" w/o intervening whitespace #changed 0061 - non-greedy match {0,4096}? matches first META tag, not last if ($$p_text =~ m!^.{0,4096}?]*?)content\s*=\s*(.*?)\s*/?>!is) { $value = $4; } elsif ($$p_text =~ m!^.{0,4096}?]*?)\s+(name|http-equiv)\s*=\s*\"?\'?fdse-$name\s*?(\"|\')?\s*/?>!is) { $value = $1; } elsif ($$p_text =~ m!^.{0,4096}?]*?)content\s*=\s*(.*?)\s*/?>!is) { $value = $4; } elsif ($$p_text =~ m!^.{0,4096}?]*?)\s+(name|http-equiv)\s*=\s*\"?\'?$name\s*?(\"|\')?\s*/?>!is) { $value = $1; } $value = &Trim($value); if ($value =~ m!^\"(.*)\"!s) { $value = $1; } elsif ($value =~ m!^\'(.*)\'!s) { $value = $1; } return $value; } sub ParseRobotFile { local $_; my ($RobotText, $my_user_agent) = @_; my @forbidden_paths = (); my @star_paths = (); my $applies = 0; # 0 => not me; 1 => me by substr match; 2 => me by * match (substr match wins over *) my $is_ua_bloc = 0; my $ua_is_god = 0; foreach (split(m!\015|\012!, $RobotText)) { if (m!^user-agent:([^\#]*)!i) { if ($is_ua_bloc == 0) { # we are at the start of a new UA block # do we already have a perfectly good substr match? first one wins last if ($applies == 1); $ua_is_god = 0; $is_ua_bloc = 1; $applies = 0; } my $agent = &Trim($1); next unless ($agent); $agent = quotemeta($agent); #changed 0051; now matching on bare "fdse" instead of "fdse robot" if (($my_user_agent =~ m!$agent!i) or ('fdse' =~ m!$agent!i)) { # ua:fdse overrides any other $applies value $applies = 1; } elsif (($applies == 0) and ($agent eq '\*')) { # ua:* overrides no-match but not a substr match $applies = 2; } } elsif ((not $ua_is_god) and (m!^disallow:([^\#]*)!i)) { $is_ua_bloc = 0; next unless ($applies); my $forbidden_path = &Trim($1); if ($forbidden_path eq '') { # null lines mean we are god, nuff said: $ua_is_god = 1; # clear any current data: if ($applies == 1) { @forbidden_paths = (); } else { @star_paths = (); } } elsif ($applies == 1) { #cleanse the data my $virtual = 'http://virtual' . $forbidden_path; my ($err, $clean) = &uri_parse( $virtual ); if ((not $err) and ($clean =~ m!^http://virtual(.+)$!)) { push(@forbidden_paths, $1); } } else { #cleanse the data my $virtual = 'http://virtual' . $forbidden_path; my ($err, $clean) = &uri_parse( $virtual ); if ((not $err) and ($clean =~ m!^http://virtual(.+)$!)) { push(@star_paths, $1); } } } } if ($applies == 1) { # okay, we had a substr match return @forbidden_paths; } else { # return whatever was present for * or nothing return @star_paths; } } sub Capitalize { my $Text = defined($_[0]) ? $_[0] : ''; my ($NewText, $Word, $NonWord) = (''); my @NoCaps = ('the', 'an', 'a', 'of', 'and', 'or'); #changed 0027 - using array not qw{} my $b_first_word = 1; while ($Text =~ m!^([\w|\'|\-]*)(\W*)(.*?)$!) { ($Word, $NonWord, $Text) = (lc($1), $2, $3); last unless ($Word or $NonWord); my $qm_Word = quotemeta($Word); $Word = ucfirst($Word) unless ((grep {m!^$qm_Word$!} @NoCaps) and (not $b_first_word)); $NewText .= $Word . $NonWord; $b_first_word = ($NonWord =~ m![\:|\.|\!|\?|\-]!); } return $NewText; } sub SearchRunTime { local $_; my ($p_realm_data, $DocSearch, $r_pages_searched, $r_hits) = @_; my $err = ''; Err: { my $URL = ''; my @WordCount = (); my ($WordMatches, $sort_num, $u, $t, $d, $k, $hdr, $n_context_matches, $context_str, $delta, $text); my ($title, $description) = (); undef($@); my $fr = &fdse_filter_rules_new($p_realm_data); my $gf = &GetFiles_new(); $err = $gf->create_file_list( 'base_dir' => $$p_realm_data{'base_dir'}, 'base_url' => $$p_realm_data{'base_url'}, 'fr' => \$fr, 'tempfile' => "runtime.file_list. " . int(10000 * rand()) . ".txt", 'verbose' => 0, ); next Err if ($err); my $count = 0; my $record_err_msg = ''; while (1) { my ($lastmodt, $size, $fullfile, $basefile, $url) = $gf->get_next_file(); last unless ($url); my %pagedata = (); ($record_err_msg, $url) = &pagedata_from_file( $fullfile, $url, \%pagedata, \$fr ); next if ($record_err_msg); ($record_err_msg, $_) = &text_record_from_hash( \%pagedata ); next if ($record_err_msg); eval($DocSearch); die($@) if ($@); } $err = $gf->quit(0); next Err if ($err); last Err; } continue { &ppstr(29,$err); } } sub check_parse_patterns { my ($doctext, $p_metadata) = @_; my $err = ''; Err: { last Err unless (-s 'parse_patterns.txt'); my $text = ''; ($err, $text) = &ReadFileL('parse_patterns.txt'); next Err if ($err); my $rule_str; foreach $rule_str (split(m!\r|\n|\015|\012!s, $text)) { next if ($rule_str =~ m!^\s*$!s); my @fields = split(m!,!s, $rule_str); my ($pattern, $index, $key) = (&ud($fields[0]), $fields[1], $fields[2]); next if ($$p_metadata{$key}); # first one wins $err = &check_regex($pattern); next Err if ($err); if ($doctext =~ m!$pattern!is) { my @out = ('', $1, $2, $3, $4, $5, $6, $7, $8, $9); $$p_metadata{ $key } = $out[$index]; $$p_metadata{ $key } = ' ' if ($$p_metadata{ $key } eq ''); # so that we can pass as if ($xx) } } last Err; } continue { &ppstr(29,$err); } } sub parse_html_ex { my ($HTML_Text, $URL, $b_SaveLinks, $r_link_array, $p_pagedata) = @_; my $b_verbose = 0; #&Assert('HASH' eq ref($p_pagedata)); local $_; if ($b_SaveLinks) { #&Assert('ARRAY' eq ref($r_link_array)); } # replace line breaks with spaces: $HTML_Text =~ tr!\r\n\t! !; # replace high spaces with \s: my $high_space = chr(160); $HTML_Text =~ s!$high_space! !og; # Initialize return values: foreach ('title', 'description', 'keywords', 'text', 'links') { $$p_pagedata{$_} = ''; } # strip unwanted portions of the HTML: $HTML_Text =~ s!(||.*?|<\!--\s*robots?\s+content\s*=\s*\"?none\"?\s*-->.*?<\!--\s*/robots?\s*-->)! !ig; # extract links: if (($b_SaveLinks) or ($::Rules{'index links'})) { # This critical chunk of code parses all of the links out of an HTML file, saving them # for later crawling, or for indexing as part of the record for the given HTML file. my $str_text = $HTML_Text; #changed 0031 - Remove sections blocked by FDSE:ROBOT tags: $str_text =~ s!(.*?|<\!--\s*robots?\s+content\s*=\s*\"?nofollow\"?\s*-->.*?<\!--\s*/robots?\s*-->)! !ig; # remove remaining comment tags: $str_text =~ s!<\!--.*?-->! !g; my $hostname = ''; $hostname = quotemeta($1) if ($URL =~ m!^http://(.*?)/!); my ($core_tag, $attribs) = (); my @links = split(m!<(A|FRAME|IFRAME|BASE|AREA) (.*?)>!i, $str_text); my $v_base = [ &uri_parse($URL) ]; print "

Status: beginning link extraction routine...

\n" if ($b_verbose); my $x = 1; for ($x = 1; $x < $#links; $x += 3) { my $err; ($core_tag, $attribs) = ($links[$x], $links[$x+1]); my $ThisLink = ''; # changed 0067 #base href, a href, area href #frame src, iframe src # bug: href|src="", then href|src='', then href|src=bare # problem is that

will cause the first pattern to match, but by incorrectly extracting "bar.gif" # # extracts "pics/weiter.gif" incorrectly my $focus_attrib = 'href'; if ($core_tag =~ m!^i?frame$!i) { $focus_attrib = 'src'; } if ($b_verbose) { my $tag = &he( "<$core_tag $attribs>" ); print "

Status: analyzing tag for focus attrib $focus_attrib: $tag

\n"; } # double-quoted attribute: if ($attribs =~ m!(^|\s)$focus_attrib\s*=\s*\"([^\"]*)!is) { $ThisLink = $2; } # single-quoted attribute: elsif ($attribs =~ m!(^|\s)$focus_attrib\s*=\s*\'([^\']*)!is) { $ThisLink = $2; } # unquoted attribute: elsif ($attribs =~ m!(^|\s)$focus_attrib\s*=\s*([^\s\>]*)!is) { $ThisLink = $2; } else { print "

Status: no match found on focus attrib $focus_attrib=VALUE; skipping to next.

\n" if ($b_verbose); next; } $ThisLink = &Trim($ThisLink); $ThisLink = '.' if ($ThisLink eq ''); if (lc($core_tag) eq 'base') { $v_base = [ &uri_parse($ThisLink) ]; next; } $$p_pagedata{'links'} .= ' '.$ThisLink if ($::Rules{'index links'}); next unless $b_SaveLinks; next if ((not $::Rules{'crawler: follow query strings'}) and ($ThisLink =~ m!\?!)); print "

Status: URL fragment string is $ThisLink (in relation to base URL $v_base->[1]).

\n" if ($b_verbose); ($err, $ThisLink) = &uri_merge($v_base, $ThisLink); next if ($err); print "

Status: URL after merge is $ThisLink.

$::str[73]: $cancel_msg"; } next Err; } $err = $obj->Merge(); next Err if ($err); } return $err; } sub ReadWrite { my ($self, $filename) = @_; $self->{'rname'} = $filename; $self->{'ename'} = "$filename.exclusive_lock_request"; $self->{'wname'} = "$filename.working_copy"; my ($p_rhandle, $rname, $p_whandle, $wname, $p_ehandle, $ename) = ($self->{'p_rhandle'}, $self->{'rname'}, $self->{'p_whandle'}, $self->{'wname'}, $self->{'p_ehandle'}, $self->{'ename'}); my $err = ''; Err: { $err = $self->LockFile_get_read_access(); next Err if ($err); # Create the appropriate files to secure our access from other LockFile.pm processes: unless (open($$p_ehandle, "+>$ename")) { $err = &pstr(70,$ename,$!); next Err; } unless (binmode($$p_ehandle)) { $err = &pstr(39,$ename,$!); next Err; } unless (&FlockEx($p_ehandle,6)) { $err = &pstr(76,$ename,$!); close($$p_ehandle); next Err; } my $h = select($$p_ehandle); $| = 1; select($h); print { $$p_ehandle } ''; chmod($::private{'file_mask'},$ename); unless (open($$p_whandle,">$wname")) { $err = &pstr(43,$wname,$!); next Err; } chmod($::private{'file_mask'},$wname); unless (&FlockEx($p_whandle, 6)) { $err = &pstr(76,$wname,$!); close($$p_whandle); next Err; } unless (binmode($$p_whandle)) { $err = &pstr(39,$wname,$!); next Err; } chmod($::private{'file_mask'},$rname); unless (open($$p_rhandle, "<$rname")) { $err = &pstr(44,$rname,$!); next Err; } unless (&FlockEx($p_rhandle, 5)) { $err = &pstr(41,$rname,$!); close($$p_rhandle); next Err; } unless (binmode($$p_rhandle)) { $err = &pstr(39,$rname,$!); next Err; } } return ($err, $p_rhandle, $p_whandle); } sub get_wname { my ($self) = @_; return $self->{'wname'}; } sub Cancel { my ($self) = @_; my ($p_rhandle, $rname, $p_whandle, $wname, $p_ehandle, $ename) = ($self->{'p_rhandle'}, $self->{'rname'}, $self->{'p_whandle'}, $self->{'wname'}, $self->{'p_ehandle'}, $self->{'ename'}); my $err = ''; Err: { # Release the read lock on $readfile, retain data $err = &freeh($p_rhandle,$rname); # Delete writefile, abandoning changes: $err = &freeh($p_whandle,$wname,1); # Delete exclusive_lock_request file: $err = &freeh($p_ehandle,$ename,1); } return $err; } sub Resume { my ($self, $filename) = @_; $self->{'rname'} = $filename; $self->{'ename'} = "$filename.exclusive_lock_request"; $self->{'wname'} = "$filename.working_copy"; my ($p_rhandle, $rname, $p_whandle, $wname, $p_ehandle, $ename) = ($self->{'p_rhandle'}, $self->{'rname'}, $self->{'p_whandle'}, $self->{'wname'}, $self->{'p_ehandle'}, $self->{'ename'}); my $err = ''; Err: { unless (open($$p_ehandle, "+<$ename")) { $err = &pstr(70, $ename, $! ); next Err; } unless (binmode($$p_ehandle)) { $err = &pstr(39, $ename, $! ); next Err; } unless (&FlockEx($p_ehandle, 6)) { $err = &pstr(76, $ename, $! ); next Err; } my $e_size = -s $ename; unless ($e_size == length(pack('LLL'))) { $err = "unable to resume file read/write operation - lock file was only size $e_size and did not contain information about where to resume the process. You will have to manually restart this process"; next Err; } my $data; unless (12 == read($$p_ehandle, $data, 12)) { $err = "error while reading from file '$ename' - $! - $^E"; next Err; } my ($pid, $read_depth, $write_depth) = unpack('LLL', $data); unless (defined($pid)) { $err = "unable to resume operation -- expected file '$ename' to contain data about the last PID but the value was not defined"; next Err; } unless (defined($read_depth)) { $err = "unable to resume operation -- expected file '$ename' to contain data about the read_depth but the value was not defined"; next Err; } unless (defined($write_depth)) { $err = "unable to resume operation -- expected file '$ename' to contain data about the write_depth but the value was not defined"; next Err; } unless ($write_depth =~ m!^\d+$!) { $err = "unable to resume operation -- write_depth returned non-integer value '$write_depth'"; next Err; } unless (open($$p_whandle, "+<$wname")) { $err = &pstr(70, $wname, $! ); next Err; } unless (binmode($$p_whandle)) { $err = &pstr(39, $wname, $! ); next Err; } unless (&FlockEx($p_whandle, 6)) { $err = &pstr(76, $wname, $! ); next Err; } unless (seek($$p_whandle, $write_depth, 0)) { $err = &pstr(72,$write_depth,$wname,$!); next Err; } my $w_size = -s $wname; if ($write_depth > $w_size) { &ppstr(53, &pstr(82, $write_depth, $wname, $w_size ) ); } elsif ($write_depth < $w_size) { &ppstr(53, &pstr(71, $write_depth, $w_size ) ); } unless (open($$p_rhandle, "+<$rname")) { $err = &pstr(70, $rname, $! ); next Err; } unless (binmode($$p_rhandle)) { $err = &pstr(39, $rname, $! ); next Err; } unless (&FlockEx($p_rhandle, 6)) { $err = &pstr(76, $rname, $! ); next Err; } unless (seek($$p_rhandle, $read_depth, 0)) { $err = &pstr(72, $read_depth, $rname, $! ); next Err; } } return ($err, $p_rhandle, $p_whandle); } sub Suspend { my ($self) = @_; my ($p_rhandle, $rname, $p_whandle, $wname, $p_ehandle, $ename) = ($self->{'p_rhandle'}, $self->{'rname'}, $self->{'p_whandle'}, $self->{'wname'}, $self->{'p_ehandle'}, $self->{'ename'}); my $err = ''; Err: { my ($read_depth, $write_depth) = (0, 0); # close the reading filehandle: $read_depth = tell($$p_rhandle); if (-1 == $read_depth) { $err = "unable to determine read depth on in-progress readfile - $! - $^E"; next Err; } $err = &freeh( $p_rhandle, $rname ); next Err if ($err); # close the writing filehandle: $write_depth = tell($$p_whandle); unless (defined($write_depth)) { $err = "write depth not defined - $! - $^E"; next Err; } if (-1 == $write_depth) { $err = "unable to determine write depth on in-progress writefile - $! - $^E"; next Err; } $err = &freeh( $p_whandle, $wname ); next Err if ($err); # Call it a day... unless (seek($$p_ehandle, 0, 0)) { $err = &pstr(72,0,$ename,$!); next Err; } my $data = pack('LLL', $$, $read_depth, $write_depth); unless (print { $$p_ehandle } $data) { $err = &pstr( 43, $ename, $! ); next Err; } $err = &freeh( $p_ehandle, $ename ); next Err if ($err); # We're leaving these files behind, hoping that somebody will come along and call LockFile->Resume very soon. If they don't, though, then a human admin will have to come along and clean up the files. Make sure the permissions are set on files we've created/owned so that they'll be allowed to: chmod($::private{'file_mask'}, $ename); chmod($::private{'file_mask'}, $rname); last Err; } continue { # there was some error - try to nuke the $ename file just to be safe unlink($ename) if (-e $ename); } return $err; } sub Merge { my ($self) = @_; my ($p_rhandle, $rname, $p_whandle, $wname, $p_ehandle, $ename) = ($self->{'p_rhandle'}, $self->{'rname'}, $self->{'p_whandle'}, $self->{'wname'}, $self->{'p_ehandle'}, $self->{'ename'}); my $err = ''; Err: { my $abort = 0; # Release the read lock on $readfile on close it: $err .= &freeh($p_rhandle,$rname); $abort = 1 if ($err); # Request an exclusive write lock on $readfile (waits for other processes with shared locks to finish up...) my $write_filehandle = *WRITE; $write_filehandle = *WRITE; unless (open($write_filehandle, "+<$rname")) { $err .= &pstr(70, $rname, $! ); $abort = 1; } else { my $attempts = $self->{'timeout'}; my $success = 0; Try: while ($attempts > 0) { if (&FlockEx(\$write_filehandle, 6)) { $success = 1; last Try; } $attempts--; sleep(1); } unless ($attempts > 0) { $err .= &pstr(76, $wname, $! ); } # Got an exclusive lock? Good. Release it and kill readfile. $err .= &freeh( \$write_filehandle, $wname ); } unless ($abort) { unless (unlink($rname)) { $err .= &pstr(54,$rname,$!); $abort = 1; } } # Replace readfile with writefile $err .= &freeh( $p_whandle, $wname ); $abort = 1 if ($err); unless ($abort) { unless (rename($wname, $rname)) { $err .= &pstr(38,$wname,$rname,$!); $abort = 1; } } chmod($::private{'file_mask'}, $rname); # Call it a day... $err .= &freeh($p_ehandle,$ename,1); } return $err; } sub timegm { my ($sec, $min, $hours, $mday, $month, $year, $p_timecache) = @_; if ($month =~ m!\D!) { my $n = 0; $month = lc($month); foreach ('jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec') { last if ($month eq $_); $n++; } $month = $n % 12; } if ($year < 100) { # Handle two-digit years: # since our effective range is only 1970 to 2037, necessity dictates the following: if ($year > 70) { $year += 1900; } else { $year += 2000; } } if (($year < 1970) or ($year > 2037)) { return 0; } # convert back to base-1900 to prevent overflows: $year -= 1900; my $base_time_at_mon_year = &basetime($month, $year, $p_timecache); if ($base_time_at_mon_year == -1) { return 0; } return ($base_time_at_mon_year) + ($sec) + ($min * 60) + (3600 * $hours) + (86400 * ($mday - 1)); } sub timelocal { my $gtime = &timegm(@_); return 0 unless ($gtime); # Calculate seconds offset between localtime and gmtime my $testtime = $gtime; # If we're anywhere near a year boundary, shift up by a day or two: my $yday = (gmtime($testtime))[7]; if (($yday < 2) or ($yday > 360)) { $testtime += 86400 * 15; } my @lt = localtime($testtime); my @gt = gmtime($testtime); my $offset = ($lt[0] - $gt[0]) + 60 * ($lt[1] - $gt[1]) + 3600 * ($lt[2] - $gt[2]) + 86400 * ($lt[7] - $gt[7]); my $ltime = $gtime - $offset; # kludge kludge kludge... I hate this ... this is a +/- 1 search pattern in case our response doesn't agree with what they input. this corrects for some weird crazyness surrounding gmtime vs localtime while daylight savings time is propagating between them. if ((localtime($ltime))[2] != $_[2]) { $ltime -= 3600; } if ((localtime($ltime))[2] != $_[2]) { $ltime += 2 * 3600; } return $ltime; } sub basetime { my ($month, $year, $p_timecache) = @_; my $time = -1; Err: { if (($p_timecache) and ('HASH' eq ref($p_timecache))) { my $key = pack('LC', $year, $month); last Err if ($time = $$p_timecache{$key}); } my $guess_time = time(); my ($guess_month, $guess_year) = (gmtime($guess_time))[4,5]; my $yeardiff = $guess_year - $year; my $mondiff = $guess_month - $month; $guess_time -= (366 * 86400) * $yeardiff; $guess_time -= (31 * 86400) * (1 + $mondiff); $guess_time = 0 if ($guess_time < 0); # Okay, no $guess_time should lie sometime before the start of $month/$year. We took that extra month just in case. # Now step forward by 25-day increments until $guess_time returns a matching $month/year while (1) { ($guess_month, $guess_year) = (gmtime($guess_time))[4,5]; last Err unless (defined($guess_month)); last Err unless (defined($guess_year)); last if (($guess_month == $month) and ($guess_year == $year)); $guess_time += 25 * 86400; last Err if ($guess_year > $year); } # Take $guess_time down to the time the month/year started: my ($sec, $min, $hour, $mday) = gmtime($guess_time); $guess_time -= ( $sec + 60 * $min + 3600 * $hour + 86400 * ($mday - 1) ); if (($p_timecache) and ('HASH' eq ref($p_timecache))) { my $key = pack('LC', $year, $month); $$p_timecache{$key} = $guess_time; } $time = $guess_time; } return $time; } sub uri_merge { my ($v_base, $str) = @_; my $err = ''; my $clean = ''; Err: { local $_; if ('ARRAY' ne ref($v_base)) { $v_base = [ &uri_parse( $v_base ) ]; # anonymous array reference to return values } if ($v_base->[0]) { # there was an error in parsing the base URL # the $str can be returned as $clean iff it validated on its own ($err, $clean) = &uri_parse( $str ); last Err unless ($err); # oh.. there was an error - how do we explain this to our end user? # don't worry too much about the format of this string. it is *extremely* rare for us to arrive at a situation where # the $base_url is not valid in our context. the only case would be when parsing an HTML document which contains a tag # that is malformed or that uses an unsupported protocol like https://xav.com/ # # we only go critical and print the $err from uri_merge in cases of HTTP redirects, and so the $base_url-is-invalid scenario is # unlikely to arise in that context $err = qq!unable to merge URL with fragment. The primary URL failed to validate with:

Error: $v_base->[0].

Because the primary URL failed, the fragment could only be evaluated as a stand-alone URL. It failed that evaluation with:

Error: $err!; next Err; } # okay - more general case - base_url valid local $_ = $str; if (m!^/!) { # absolute link from top-level directory $_ = 'http://' . $v_base->[2] . ':' . $v_base->[3] . $_; } elsif (m!^\#!) { # a relative link on this page. just strip any current frag and append this one $_ = 'http://' . $v_base->[2] . ':' . $v_base->[3] . $v_base->[4] . $v_base->[5] . $_; } elsif (m!^\w+\:!) { # a protocol link. this link stands on its own as $_ } else { # relative link $_ = $v_base->[7] . $_; } ($err, $clean) = &uri_parse( $_ ); next Err if ($err); last Err; } return ($err, $clean); } 1;