#!/usr/bin/perl
# Xavatoria Indexed Search, Index Building Module
# Copyright 1997 by Fluid Dynamics
#
# For latest version and help files, visit:
# http://www.xav.com/scripts/xavatoria/
#
# Purpose: walk $basedir, collect every text file that is not excluded,
# and write one line per file to $Index_File. Each line holds the fields
#   [crank factor] URL, URL, description, size, last-modified, word list
# separated by the '%%==%%' delimiter.
# __________________________________________________________________

use strict;
use warnings;

# Enter the location on your system of the new index file:

my $Index_File = "/home/martialartsresource/www/filipino/index.txt";

# $baseurl is the web page that corresponds with files in your base
# directory, which is called $basedir. Use absolute paths like shown,
# not relative paths. Do NOT include trailing slashes.

my $baseurl = "http://64.33.44.189/anonftp/pub/eskrima/digests";
my $basedir = "/home/martialartsresource/anonftp/pub/eskrima/digests";

# $extensions holds the file extensions that will be included in
# searches. It's best to leave out ones like ".log" or ".cgi". Note
# the "." delimiters, and their occurrence at both the beginning and
# end of the set.
#
# The extension filter is switched OFF by default, so every text file
# whose path contains a dot is indexed. Set $Filter_By_Extension to 1
# to index only files whose extension appears in $extensions.

my $extensions = '.txt.html.htm.shtml.stm.ztml.shtml.';
# A previous edit left the line  *$extensions = "\.n*\.";  here. It was a
# symbolic glob assignment with no effect on indexing, so it is disabled.
my $Filter_By_Extension = 0;

# Below are the files or directories that you do NOT want to be
# searched. Note that they all have one blank space after the
# file or directory, and that directories do not include trailing
# slashes. Also note that we use the ".=" instead of the "=".

my $DMZ = '';
$DMZ .= "/null ";
#$DMZ .= "/usr/www/users/xav/cgi-bin ";
#$DMZ .= "/usr/www/users/xav/counters ";

# Below you list the files that your visitors really should see (why
# not show off your best work, eh?). While the search results remain
# honest as to whether or not terms were found, these files will have
# their numerical ranking multiplied by the CRANK FACTOR, which should
# be an integer between two and twenty. Only files can go here, not
# directories. The same rules from above apply...

my $CRANK_FACTOR = 18;
my $CRANK = '';
#$CRANK .= "/usr/www/users/xav/links.ztml ";
#$CRANK .= "/usr/www/users/xav/scripts/index.html ";
#$CRANK .= "/usr/www/users/xav/clients.html ";

# Options for Weighted Search:
#
# All occurrences of a search term count as one point. The occurrence
# of a term in the URL, Title, Meta keywords, or Meta description
# can have added weight (equivalent to a multiplier per hit). Enter
# the multipliers below - the defaults are (4, 10, 10, 4).
# If this makes no sense to you, just ignore it and leave the defaults
# as they are - they work pretty well. Note that this will give extra
# weight to those pages that have a properly formatted Title and Meta
# tags, even if they contain the same basic information (kinda like the
# real search engines).

my $Filename_Rank    = 4;
my $Title_Rank       = 10;
my $Keyword_Rank     = 10;
my $Description_Rank = 4;

# When a page has no Meta description, one is built from the page text:
# skip the first $Description_Skip words, then take up to
# $Description_Length words. (The stock values were 0 and 25.)

my $Description_Skip   = 23;
my $Description_Length = 57;

# No further editing is necessary, but feel free to play around...
#
# __________________________________________________________________

print "Content-type: text/plain\n\n";

# IIS hands out backslash-separated paths; everything else uses "/".
my $server = defined $ENV{'SERVER_SOFTWARE'} ? $ENV{'SERVER_SOFTWARE'} : '';
my $slash  = ($server =~ /IIS/) ? '\\' : '/';

unless (-d $basedir) {
    print "Fatal Error!\n";
    print "Searched for a directory at specified location:\n";
    print " $basedir\n";
    print "No directory found. Check settings.\n";
    exit;
}

# ------------------------------------------------------------------
# Pass 1: breadth-first walk of $basedir collecting indexable files.
# ------------------------------------------------------------------

my @FILES;
my @pending = ($basedir);

while (@pending) {
    my $directory = shift @pending;

    my $dh;
    unless (opendir($dh, $directory)) {
        # Best effort: report to the error log and keep going.
        warn "Cannot read directory $directory: $!\n";
        next;
    }
    my @entries = readdir($dh);
    closedir($dh);

    foreach my $entry (@entries) {
        next if $entry eq '.' || $entry eq '..';

        my $full = $directory . $slash . $entry;

        # $DMZ entries are each followed by one space; \Q..\E keeps
        # regex metacharacters in the path from being interpreted.
        next if $DMZ =~ /\Q$full\E /i;

        if (-d $full) {
            push @pending, $full;
        }
        elsif (-T $full && $full =~ /.*\.(.*)/) {
            my $ext = $1;    # text after the last dot in the path
            next if $Filter_By_Extension
                 && index($extensions, ".$ext.") < 0;
            push @FILES, $full;
        }
    }
}

my $Number_Files = @FILES;
print " $Number_Files\n";

# ------------------------------------------------------------------
# Pass 2: write one index record per file.
# ------------------------------------------------------------------

my $delimiter = '%%==%%';

open(my $index_fh, '>', $Index_File) or Fatal($!);

# File paths are normalised to forward slashes before the base directory
# is stripped, so normalise the base directory the same way.
(my $base_prefix = $basedir) =~ s{\\}{/}g;

foreach my $FILE (@FILES) {
    my $string = '';
    if (open(my $fh, '<', $FILE)) {
        $string = join(' ', <$fh>);
        close($fh);
    }
    else {
        # Still index the file by URL, just without any body text.
        warn "Cannot read $FILE: $!\n";
    }
    $string =~ s/\n/ /g;

    # $Title is extracted but not written: the second field of the
    # record carries the URL instead of the title.
    my ($Title, $Description) = Extract_Meta(\$string, $FILE);

    my $Size = (-s $FILE) || 0;
    my $last = Last_Modified($FILE);

    (my $webname = $FILE) =~ s{\\}{/}g;
    if (index($webname, $base_prefix) == 0) {
        $webname = substr($webname, length $base_prefix);
    }
    my $URL = $baseurl . $webname;

    # Weight hits on the file name, then reduce the text to single-space
    # separated words (every non-word character and "_" becomes a space).
    $string .= " $URL" x $Filename_Rank;
    $string = " $string ";
    $string =~ s/[\W_]+/ /g;

    # $CRANK entries, like $DMZ entries, are each followed by one space.
    my $boost = (index($CRANK, "$FILE ") >= 0) ? $CRANK_FACTOR : '';

    print {$index_fh} join($delimiter,
        $boost, $URL, $URL, $Description, $Size, $last, $string), "\n"
        or Fatal($!);
}

# Buffered write errors only surface when the handle is closed.
close($index_fh) or Fatal($!);

print "Done - Captured $Number_Files files total.\n";
my $Index_Size = (-s $Index_File) || 0;
print "Index file is $Index_Size bytes.\n\n";

# Last_Modified($filename)
#
# Returns the file's modification date as "D Mon YYYY" in local time,
# or "unknown" if the file cannot be stat'ed.
#
# Based on a procedure written by Jeff Carnahan of Terminal Productions
# (http://www.terminalp.com)
sub Last_Modified {
    my ($filename) = @_;

    my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);

    my $mtime = (stat($filename))[9];
    return 'unknown' unless defined $mtime;

    my ($mday, $mon, $year) = (localtime($mtime))[3, 4, 5];

    # localtime() reports the year as an offset from 1900.
    return "$mday $months[$mon] " . ($year + 1900);
}

# Extract_Meta(\$text, $file)
#
# Pulls the title and description out of a page and prepares the page
# text for indexing.
#
#   \$text  reference to the page text; modified in place: the <title>
#           element and all other tags are removed (ALT text is kept),
#           and the title, Meta keywords and Meta description are
#           appended $Title_Rank / $Keyword_Rank / $Description_Rank
#           times so that they weigh more in searches.
#   $file   path of the page; used as the title when none is found.
#
# Returns ($Title, $Description), both with whitespace collapsed. When
# the page has no Meta description, one is built from the page words
# selected by $Description_Skip and $Description_Length, followed by
# "...".
sub Extract_Meta {
    my ($text_ref, $file) = @_;

    my ($Title, $Description) = ('', '');

    # Weighted copies are collected apart from the page text so that the
    # generated description can never run into them.
    my $extras = '';

    if ($$text_ref =~ s/<title>([^<]*)<\/title>/ /i) {
        $Title = $1;
        $extras .= " $Title" x $Title_Rank;
    }
    else {
        $Title = $file;
    }

    if ($$text_ref =~ /<meta\s+name="keywords"\s+content="([^"]*)">/i) {
        $extras .= " $1" x $Keyword_Rank;
    }

    if ($$text_ref =~ /<meta\s+name="description"\s+content="([^"]*)">/i) {
        $Description = $1;
        $extras .= " $Description" x $Description_Rank;
    }

    # Keep ALT text, then drop every remaining tag.
    for my $chunk ($$text_ref, $extras) {
        $chunk =~ s/<[^>]*\s+ALT\s*=\s*"([^>"]*)"[^>]*>/ $1 /ig;
        $chunk =~ s/<[^>]*>/ /g;
    }

    if ($Description eq '') {
        my @words = split /\s+/, $$text_ref;
        my $first = $Description_Skip;
        my $final = $first + $Description_Length - 1;
        $final = $#words if $final > $#words;

        # An empty range (short page) leaves just the "..." marker.
        $Description = join('', map { "$_ " } @words[$first .. $final])
                     . '...';
    }

    $$text_ref .= ' ' . $extras;

    for my $field ($Title, $Description) {
        $field =~ s/\s+/ /g;
        $field =~ s/^ //;
    }

    return ($Title, $Description);
}

# Fatal($reason)
#
# Reports that the index file could not be written and ends the script.
# $reason is the optional system error text (normally $!).
sub Fatal {
    my ($reason) = @_;

    print "Fatal Error! Cannot write to index file.\n";
    print "This script does not have permission to modify the file:\n";
    print " $Index_File.\n";
    print "System error: $reason\n" if defined $reason && length $reason;
    print "Build Module aborted.\n\n";
    print "Hint: Try:\n\n";
    print "chmod 777 $Index_File\n\n";
    exit;
}