Diffstat (limited to 'lib')
-rw-r--r--  lib/gorg/base.rb             602
-rw-r--r--  lib/gorg/cache.rb            493
-rwxr-xr-x  lib/gorg/cgi-bin/gorg.cgi     45
-rwxr-xr-x  lib/gorg/cgi-bin/search.cgi   50
-rw-r--r--  lib/gorg/cgi.rb              198
-rwxr-xr-x  lib/gorg/fcgi-bin/gorg.fcgi   61
-rw-r--r--  lib/gorg/log.rb               56
-rw-r--r--  lib/gorg/search.rb           444
-rw-r--r--  lib/gorg/www.rb              207
9 files changed, 2156 insertions, 0 deletions
diff --git a/lib/gorg/base.rb b/lib/gorg/base.rb new file mode 100644 index 0000000..c3851a9 --- /dev/null +++ b/lib/gorg/base.rb @@ -0,0 +1,602 @@ +### Copyright 2004, Xavier Neys (neysx@gentoo.org) +# # +# # This file is part of gorg. +# # +# # gorg is free software; you can redistribute it and/or modify +# # it under the terms of the GNU General Public License as published by +# # the Free Software Foundation; either version 2 of the License, or +# # (at your option) any later version. +# # +# # gorg is distributed in the hope that it will be useful, +# # but WITHOUT ANY WARRANTY; without even the implied warranty of +# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# # GNU General Public License for more details. +# # +# # You should have received a copy of the GNU General Public License +# # along with Foobar; if not, write to the Free Software +### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +module Gorg + Version = "0.6" +end + +# Some required stuff for gorg +require 'time' + +require 'gorg/xsl' +require 'gorg/log' +require 'gorg/cache' +require 'timeout' +require 'cgi' +require 'stringio' +require 'zlib' +require 'ipaddr' + + +module Gorg + + def xproc(path, params, list=false, printredirect=false) + # Process file through xslt passing params to the processor + # path should be the absolute path of the file, i.e. not relative to DocumentRoot + # + # Since 0.4, path can also be a string containing + # the actual xml to be processed + # + # Use default stylesheet if none can be found in the file + # Return a list of files read by the processor (useful to do caching) if requested + # + # Return an error condition and, hopefully, some useful output + # Do not raise any exception + # In most cases, an error will result in no output but + # the xslt processor can consider some errors as warnings and + # return the best result it could come up with along with a warning + # e.g. if a file used in a document() function cannot be found, + # the xslt processor will return some output and a warning. + # It's up to the caller to decide whether to use the output or b0rk + # + # The return value is an array of 2 to 4 items: [{}, "", [[]], []] + # 1. hash with error information, its keys are + # 1.a "xmlErrCode" 0 is no error, -9999 means an exception has been raised in this block (unlikely), + # anything else is an error code (see /usr/include/libxml2/libxml/xmlerror.h) + # 1.b "xmlErrLevel" again, from libxml2, 0==OK, 1==Warning, 2==Error, 3==Fatal + # 1.c "xmlErrLevel" again, from libxml2, some explanation about what went wrong + # 2. output from xsltprocessor (or error message from a raised exception) + # 3. list of files that the xslt processor accessed if the list was requested, + # paths are absolute, i.e. not relative to your docroot. + # Each entry is an array [access type, path] with access_type being + # "r" for read, "w" for written (with exsl:document) or "o" for other (ftp:// or http://) + # 4. array of CGI::Cookie to be sent back + # + # Examples: [{"xmlErrMsg"=>"blah warning blah", "xmlErrCode"=>1509, "xmlErrLevel"=>1}, "This is the best XSLT could do!", nil] + # [{"xmlErrCode"=>0}, "Result of XSLT processing. Well done!", ["/etc/xml/catalog","/var/www/localhost/htdocs/doc/en/index.xml","/var/www/localhost/htdocs/dtd/guide.dtd"]] + + xsltproc = Gorg::XSL.new + xsltproc.xroot = $Config["root"] + # Grab strings from xsl:message + xslMessages = [] + # Does the caller want a list of accessed files? 
+ xsltproc.xtrack = list; filelist = Array.new + # Process .xml file with stylesheet(s) specified in file, or with default stylesheet + xsltproc.xml = path + # Look for stylesheet href (there can be more than one) + regexp = Regexp.new('<\?xml-stylesheet.*href="([^"]*)".*') + l = $Config["headXSL"] ; styles = Array.new + if FileTest.file?(path) then + # Path is indeed a file name + IO.foreach(path) { |line| + styles << $1 if regexp.match(line) + break if (l-=1) == 0 + } + else + # Scan xml for stylesheet names + path.each { |line| styles << $1 if regexp.match(line) } + end + # Use default stylesheet if none were found in the doc + styles << $Config["defaultXSL"] if styles.length == 0 + # Add params, we expect a hash of {param name => param value,...} + xsltproc.xparams = params + # Process through list of stylesheets + firstErr = {} + while xsltproc.xsl = styles.shift + xsltproc.process + filelist += xsltproc.xfiles if xsltproc.xtrack? + # Break and raise 301 on redirects + xsltproc.xmsg.each { |r| + if r =~ /Redirect=(.+)/ then + if printredirect then + STDERR.puts "Location: #{$1}" + else + raise Gorg::Status::MovedPermanently.new($1) + end + end + } + xslMessages += xsltproc.xmsg + # Remember 1st warning / error + firstErr = xsltproc.xerr if firstErr["xmlErrLevel"].nil? && xsltproc.xerr["xmlErrLevel"] > 0 + # B0rk on error, an exception should have been raised by the lib, but, er, well, you never know + break if xsltproc.xerr["xmlErrLevel"] > 1 + xsltproc.xml = xsltproc.xres + end + # Keep 1st warning / error if there has been one + firstErr = xsltproc.xerr if firstErr["xmlErrLevel"].nil? + # Return values + [ firstErr, xsltproc.xres, (filelist.uniq if xsltproc.xtrack?), xslMessages ] + rescue => ex + if ex.respond_to?(:errCode) then + # One of ours (Gorg::Status::HTTPStatus) + # Propagate exception + raise + else + debug "in xproc exception handler: #{ex.inspect} // #{xsltproc.xerr.inspect}" + # Return exception message and an error hash as expected from the xslt processor + # Use error codes that the xslt lib might have returned + [ if (xsltproc.xerr["xmlErrCode"]||-1) == 0 then + { "xmlErrMsg" => ex.to_s, + "xmlErrCode" => 9999, + "xmlErrLevel" => 3 + } + else + { "xmlErrMsg" => xsltproc.xerr["xmlErrMsg"] || ex.to_s, + "xmlErrCode" => xsltproc.xerr["xmlErrCode"], + "xmlErrLevel" => xsltproc.xerr["xmlErrLevel"] + } + end , + ex.to_s, + (filelist.uniq if xsltproc.xtrack?) 
+ ] + end + end + + # HTTP status codes and html output + module Status + class HTTPStatus < StandardError + def html(err="") + <<-EOR +<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN"> +<HTML> +<HEAD><TITLE>#{errSts}</TITLE></HEAD> +<BODY> +<H1>#{errLabel}</H1> +<font color="#FF0000">#{err}</font> +<HR> +</BODY> +</HTML> + EOR + end + def errSts + "#{errCode} #{errLabel}" + end + # Default is unknown error + def errLabel + "Undefined Error" + end + def errCode + 999 + end + def header + {'Status' => errSts} + end + end + + class NotModified < HTTPStatus + def initialize(stat) + # 304 needs to send ETag and Last-Modified back + @mstat=stat + end + def header + {'Last-Modified' => @mstat.mtime.httpdate.dup, 'ETag' => makeETag(@mstat).dup}.merge(super) + end + def html + "" + end + def errLabel + "Not Modified" + end + def errCode + 304 + end + end + + class MovedPermanently < HTTPStatus + def initialize(loc) + # 301 needs to send Location: + @location=loc + end + def errLabel + "Moved Permanently" + end + def errCode + 301 + end + def header + {'Location' => @location}.merge(super) + end + def html + # RFC says "should" not "must" add a body + "" + end + def html301 # Not used + <<-EO301 +<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN"> +<html><head> +<title>301 Moved Permanently</title> +</head><body> +<h1>Moved Permanently</h1> +<p>The document has moved <a href="#{@location}">here</a>.</p> +</body></html> + EO301 + end + end + + class Forbidden < HTTPStatus + def errLabel + "Forbidden" + end + def errCode + 403 + end + end + + class NotFound < HTTPStatus + def errLabel + "Not Found" + end + def errCode + 404 + end + end + + class NotAllowed < HTTPStatus + def errLabel + "Method Not Allowed" + end + def header + {'Allow'=>'GET,HEAD'}.merge(super) + end + def errCode + 405 + end + end + + class SysError < HTTPStatus + def errLabel + "Internal Server Error" + end + def errCode + 500 + end + end + end #Status module + + + def gorgInit + # Initialize gorg, i.e. read config file, init cache, ... + # Simply build a hash of params => value in a global variable called $Config + + # Set up default values + $Config = { "AppName" => "gorg", # Used for syslog entries, please keep 'gorg' (cannot be changed in config file) + "root" => nil, # No root dir by default (cgi uses DOCUMENT_ROOT from its environment) + "port" => 8000, # Used for stand-alone web server (WEBrick) + "headXSL" => 12, # Only read 12 lines in xml files to identify required stylesheets + "defaultXSL" => nil, # No default stylesheet, how could I guess? + "cacheDir" => nil, # No cache by default. Directory must exist and be writable. + "cacheTTL" => 0, # Number of seconds after which a document is considered too old, 0=never + "cacheSize" => 40, # in MegaBytes, max size of cache, used when autocleanig + "zipLevel" => 2, # Compresion level used for gzip support (HTTP accept_encoding) (0-9, 0=none, 9=max) + "maxFiles" => 9999, # Max number of files in a single directory in the cache tree + "cacheTree" => 0, # Use same tree as on site in cache, 0 = disabled + "cacheWash" => 0, # Clean cache automatically and regularly when a store into the cache occurs. 0 = disabled + # gorg cleans up if random(param_value) < 10. It will only clean same dir it caches to, not whole tree. + # i.e. 
a value<=10 means at every call (not a good idea), 100 means once/10 stores, 1000 means once/100 stores + "logLevel" => 4, # INFO, be slightly verbose by default (messages go to syslog) OFF, FATAL, ERROR, WARN, INFO, DEBUG = 0, 1, 2, 3, 4, 5 + "passthru" => true, # Allow return of requested file without processing it if passthru="anything but 0" is passed + "acceptCookies" =>false,# Allow cookies in & out of transforms + "linkParam" => "link", # Pass pathname of requested file in 'link' param to xsl transform + "HTTP_HOST" => nil, # Pass host value from HTTP header to xsl transform + "accessLog" => "syslog",# or a filename or STDERR, used to report hits from WEBrick, not used by cgi's + "autoKill" => 0, # Only used by fastCGI, exit after so many requests (0 means no, <=1000 means 1000). Just in case you fear memory leaks. + "in/out" => [], # (In/Ex)clude files from indexing + "mounts" => [], # Extran mounts for stand-alone server + "listen" => "127.0.0.1" # Let webrick listen on given IP + } + # Always open syslog + @syslog = Gorg::Log::MySyslog.new($Config["AppName"]) + $Log = Gorg::Log::MyLog.new(@syslog, 5) # Start with max + + # Check for config file + configf = ENV["GORG_CONF"]||"/etc/gorg/gorg.conf" + raise "Cannot find config file (#{configf})" unless FileTest.file?(configf) and FileTest.readable?(configf) + file = IO.read(configf) + parseConfig($Config, file) + + # Init cache + Cache.init($Config) if $Config["cacheDir"] + + # Set requested log level + $Log.level = $Config["logLevel"] + rescue + error("Gorg::init failed: #{$!}") + STDERR.puts("Gorg::init failed: #{$!}") + exit(1) + end + + def scanParams(argv) + # Scan argv for --param paramName paramValue sequences + # params are removed from argv + # Return a hash of {"name" => "value"} + h = Hash.new + while idx = argv.index('--param') + break if argv.length <= idx+2 # We need at least 2 more args after --param + argv.delete_at(idx) # Remove --param from argv + name = argv.delete_at(idx) # Remove param name from argv + value = argv.delete_at(idx) # Remove param value from argv + h[name] = value # Add entry in result + end + + h if h.length > 0 + end + + private + def parseConfig(h, config) + config.each {|line| + line.strip! + next if line.length == 0 or line[0,1] == '#' # Skip blank lines and comments + raise "Invalid Configuration (#{line})" unless line =~ /^([a-zA-Z_]*)\s*=\s*/ + param = $1 + value = $' + # If value starts with ' or ", it ends with a similar sign and does not accept any in the value, no escaping... We keep it simple + # otherwise, it ends with EOL or first space + if value =~ /["'](.*)['"]/ then + value = $1 + end + value.strip! + raise "No value for #{param}" unless value.length > 0 + # Check param / value (only syntactical checks here) + case param.downcase + when "root" + h["root"] = value + when "port" + h["port"] = value.to_i + when "passthru" + h["passthru"] = value.squeeze != "0" + when "acceptcookies" + h["acceptCookies"] = value.squeeze == "1" + when "linkparam" + if value =~ /^\s*([a-zA-Z]+)\s*$/ then + h["linkParam"] = $1 + else + h["linkParam"] = nil + end + when "httphost" + hosts = value.squeeze(" ") + case hosts + when /^0?$/ + hh = nil + when "*" + hh = ["*"] + else + hh = hosts.split(" ") + # Add IPs + hosts.split(" ").each { |ho| + begin + hh += TCPSocket.gethostbyname(ho)[3..-1] if ho != '*' + rescue + # Ignore + nil + end + } + hh.uniq! 
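+ # Illustrative sketch (hostname and IP are hypothetical): a config line such
+ # as 'httphost = www.example.org' leaves hh == ["www.example.org", "192.0.2.10"]
+ # after the address lookup above.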
+ end + h["httphost"] = hh + when "headxsl" + h["headXSL"] = value.to_i + when "defaultxsl" + h["defaultXSL"] = value + when "cachedir" + h["cacheDir"] = value + when "cachettl" + h["cacheTTL"] = value.to_i + when "cachesize" + h["cacheSize"] = value.to_i + when "maxfiles" + h["maxFiles"] = value.to_i + when "cachetree" + h["cacheTree"] = value.squeeze != "0" + when "ziplevel" + if value =~ /^\s*([0-9])\s*$/ then + h["zipLevel"] = $1.to_i + else + h["zipLevel"] = 2 + end + when "cachewash" + h["cacheWash"] = value.to_i + when "loglevel" + h["logLevel"] = value.to_i + when "accesslog" + h["accessLog"] = value + when "autokill" + h["autoKill"] = value.to_i + when "listen" + begin + ip = IPAddr.new(value) + h["listen"] = ip.to_s + rescue + h["listen"] = "127.0.0.1" + end + when "dbconnect" + h["dbConnect"] = value + when "dbuser" + h["dbUser"] = value + when "dbpassword" + h["dbPassword"] = value + when "exclude" + h["in/out"] << [false, Regexp.new(value)] + when "include" + h["in/out"] << [true, Regexp.new(value)] + when "fpath_to_lang" + h["flang"] = Regexp.new(value) + when "xpath_to_lang" + h["xlang"] = value + when "mount" + if value =~ /^([^\s]+)\s+ON\s+(.+)$/i then + h["mounts"] << [$1, $2] + end + else + raise "Unknown parameter (#{param})" + end + } + rescue + raise "Could not parse config file: #{$!}" + end + + # Utilities + def contentType(aMsg) + # Find the Content-Type=xxx/yyy line in aMsg + # from the Meta file in the cache + ct = nil + aMsg.each { |s| + if s =~ /^Content-Type:(.+)$/ then + ct = $1 + break + end + } + ct + end + + def setContentType(data) + # Set content-type according to x(ht)ml headers + charset = nil + if data =~ /^<\?xml .*encoding=['"](.+)['"]/i then + charset = $1 if $1 + # XML / XHTML + if data[0..250] =~ /^<\!DOCTYPE\s+html/i then + # XHTML + ct = 'application/xhtml+xml' + else + # XML + ct = 'text/xml' + end + if charset then + ct << "; charset=#{charset}" + end + elsif data =~ /^<\!DOCTYPE\s+html\sPUBLIC\s(.+DTD XHTML)?/i then + # (X)HTML + if $1 then + # XHTML + ct = 'application/xhtml+xml' + else + # HTML + ct = 'text/html' + end + elsif data =~ /<html/i then + # HTML + ct = 'text/html' + else + # TXT + ct = 'text/plain' + end + ct + end + + def makeCookies(aMsg) + # Make an array of CGI::Cookie objects + # msg is expected to be an array of strings like 'Set-Cookie(name)value=param' + # (output by the xsl transform with xsl:message) + cookies = Hash.new + aMsg.each { |s| + if s =~ /^Set-Cookie\(([^\)]+)\)([a-zA-Z0-9_-]+)=(.+)$/ then + # $1 = cookie name $2 = key $3 = value + if cookies.has_key?($1) then + cookies[$1] << "#{$2}=#{$3}" + else + cookies[$1] = ["#{$2}=#{$3}"] + end + end + } + if cookies.length > 0 then + # Make CGI::Cookie objects + cookies.map { |k,v| + CGI::Cookie.new('name' => k, 'value' => v, 'expires' => Time.now + 3600*24*30) + } + else + nil + end + end + + def cookies_to_params(cookies) + # Turn array of CGI::Cookie objects into a Hash of key=>value + # cookies is a hash, forget the keys, + # each value should be an array of strings, each string should be like 'param=value' + h = {} + cookies.values.each { |v| + if v.class==String and v =~ /^([a-zA-Z0-9_-]+)=(.+)$/ then + h[$1] = $2 + elsif v.class==Array then + v.each { |vv| + if vv.class==String and vv =~ /^([a-zA-Z0-9_-]+)=(.+)$/ then + h[$1] = $2 + end + } + elsif v.class==CGI::Cookie then + v.value.each { |vv| + if vv.class==String and vv =~ /^([a-zA-Z0-9_-]+)=(.+)$/ then + h[$1] = $2 + end + } + end + } + h + rescue + error "Could not parse cookies (#{$!}) " + {} + end + + 
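+ # Usage sketch for the conditional-request helpers below (all values are
+ # hypothetical, for illustration only):
+ #
+ #   st    = File.stat("/var/www/htdocs/index.xml")
+ #   etag  = makeETag(st)                # => e.g. '"1a2b-43cd9e60"' (hex size-mtime)
+ #   etags = split_header_etags(%Q{"1a2b-43cd9e60", "0-0"})
+ #   notModified?(st, etags, st.mtime)   # => true: the ETag matches and the
+ #                                       #    file is unchanged since mtime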
def notModified?(fstat, etags, ifmodsince) + # Decide whether file has been modified according to either etag, last mod timestamp or both + # If both If-None-Match and If-Modified-Since request header fields are present, + # they have to be tested both + res = false + if fstat then + a = etags.to_a + if ifmodsince && etags then + res = (ifmodsince >= fstat.mtime) && (a.include?(makeETag(fstat)) || a.include?('*')) + elsif etags + res = a.include?(makeETag(fstat)) || a.include?('*') + elsif ifmodsince + res = ifmodsince >= fstat.mtime + end + end + # Return result + res + end + + def split_header_etags(str) + # Split header values expected as "value1", "value2", ... into an array of strings + str.scan(/((?:"(?:\\.|[^"])+?"|[^",]+)+)(?:,\s*|\Z)/xn).collect{|v| v[0].strip } + end + + def makeETag(st) + # Format file stat object into an ETag using its size & mtime + # Parameter can either be a filename or a stat object + st = File.stat(st) unless st.respond_to?(:ino) + sprintf('"%x-%x"', st.size, st.mtime.to_i) + end + + def gzip(data, level) + gz = "" + io = StringIO.new(gz) + gzw = Zlib::GzipWriter.new(io, level) + gzw.write data + gzw.close + gz + end + + def gunzip(data) + io = StringIO.new(data) + gzw = Zlib::GzipReader.new(io) + gunz = gzw.read + gzw.close + gunz + end + +end diff --git a/lib/gorg/cache.rb b/lib/gorg/cache.rb new file mode 100644 index 0000000..543b6a2 --- /dev/null +++ b/lib/gorg/cache.rb @@ -0,0 +1,493 @@ +### Copyright 2004, Xavier Neys (neysx@gentoo.org) +# # +# # This file is part of gorg. +# # +# # gorg is free software; you can redistribute it and/or modify +# # it under the terms of the GNU General Public License as published by +# # the Free Software Foundation; either version 2 of the License, or +# # (at your option) any later version. +# # +# # gorg is distributed in the hope that it will be useful, +# # but WITHOUT ANY WARRANTY; without even the implied warranty of +# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# # GNU General Public License for more details. +# # +# # You should have received a copy of the GNU General Public License +# # along with Foobar; if not, write to the Free Software +### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +# Cache a bit of data based on +# . a path name as received by a webserver e.g. +# . a list of parameters as received by a webserver e.g. +# . a list of files it depends on + +require "parsedate" +require "fileutils" +require "find" +require "digest" +require "digest/md5" + +module Gorg + +CacheStamp = "Gorg-#{Gorg::Version} Cached This Data. Do not alter this file. Thanks." + +module Cache + def Cache.init(config) + @@lockfile = ".cache.cleaner.lock" + @cacheDir = nil + if FileTest.directory?(config["cacheDir"]) + if FileTest.writable?(config["cacheDir"]) + @cacheDir = config["cacheDir"].chomp("/") + else + warn "Cache directory not writable" + end + else + warn "Invalid cache directory" + end + + # Time-To-Live in seconds, cached items older than that will be considered too old + @zipLevel = config["zipLevel"] + @zip = @zipLevel > 0 ? 
".gz" : "" + @ttl = config["cacheTTL"] + @cacheTree = config["cacheTree"] + @maxFiles = config["maxFiles"] # Max number of files in a single directory + @maxSize = config["cacheSize"]*1024*1024 # Now in bytes + @washNumber = config["cacheWash"] # Clean cache dir after a store operation whenever rand(@washNumber) < 10 + @lastCleanup = Time.new-8e8 # Remember last time we started a cleanup so we don't pile them up + end + + def Cache.hit(objPath, objParam={}, etags=nil, ifmodsince=nil) + # objPath is typically a requested path passed from a web request but it + # can be just any string. It is not checked against any actual files on the file system + # + # objParam is expected to be a hash or any object whose iterator yields two values + # + # 2 filenames are built with the arguments and should give + # the name of a metafile and a result file + # if the result file is older than @ttl seconds, hit fails + # The metafile is then checked for dependencies + # It contains a list of filenames along with their size and mtime separated by ;; + + # etag and ifmodsince are used in a webserver context + # etag is defined if an ETag was part of an If-None-Match request field + # etag can be an array or a single string + # If the current ETag of the meta file matches, no data is returned (webserver should return a 304) + # + # ifmodsince is a time object passed on an If-Modified-Since request field + # If the creation date of the meta file is earlier, no data is returned (webserver should return a 304) + + return nil if @cacheDir.nil? # Not initialized, ignore request + + # Reminder: filenames are full path, no need to prepend dirname + dirname, basename, filename, metaname = makeNames(objPath, objParam) + + raise "Cache subdir does not exist" unless FileTest.directory?(dirname) + + # Hit the cache + meta, mstat = IO.read(metaname), File.stat(metaname) if metaname && FileTest.file?(metaname) && FileTest.readable?(metaname) + raise "Empty/No meta file" if meta.nil? || meta.length < 1 + + fstat = File.stat(filename) if filename && FileTest.file?(filename) + raise "Empty/No data file" if fstat.nil? + + # Check the timestamps of files in the metadata + meta = meta.split("\n") + raise "I did not write that meta file" unless CacheStamp == meta.shift + mline = meta.shift + while mline and mline !~ /^;;extra meta$/ do + f, s, d = mline.split(";;") + if s.to_i < 0 + # File did not exist when cache entry was created + raise "Required file #{f} has (re)appeared" if FileTest.file?(f) && FileTest.readable?(f) + else + # File did exist when cache entry was created, is it still there? + raise "Required file #{f} has disappeared" unless FileTest.file?(f) && FileTest.readable?(f) + + fst = File.stat(f) + raise "Size of #{f} has changed from #{fst.size} to #{s.to_i}" unless fst.size == s.to_i + raise "Timestamp of #{f} has changed" unless Time.utc(*ParseDate.parsedate(d)) == fst.mtime.utc + end + mline = meta.shift + end + if mline =~ /^;;extra meta$/ then + extrameta = meta.dup + else + extrameta = [] + end + + if notModified?(fstat, etags, ifmodsince) and extrameta.join !~ /set-cookie/i + raise Gorg::Status::NotModified.new(fstat) + end + + file = IO.read(filename) if filename && FileTest.file?(filename) && FileTest.readable?(filename) + raise "Empty/No data file" if file.nil? 
|| file.length < 1 + + # Is the data file too old + raise "Data file too old" unless @ttl==0 or (Time.new - fstat.mtime) < @ttl + + # Update atime of files, ignore failures as files might have just been removed + begin + t = Time.new + File.utime(t, fstat.mtime, filename) + File.utime(t, mstat.mtime, metaname) + rescue + nil + end + + # If we get here, it means the data file can be used, return cache object (data, stat(datafile), extrameta) + # The file is left (un)compressed, it's returned as it was stored + [file, fstat, extrameta] + + rescue Gorg::Status::NotModified + # Nothing changed, should return a 304 + debug("Client cache is up-to-date") + raise + rescue + # cache hit fails if anything goes wrong, no exception raised + debug("Cache hit on #{objPath} failed: (#{$!})") + nil + end + + + def Cache.store(data, objPath, objParam={}, deps=[], extrameta=[]) + # Store data in cache so it can be retrieved based on the objPath and objParams + # deps should contain a list of files that the object depends on + # as returnd by our xsl processor, i.e. an array of [access_type, path] where + # access_type can be "r", "w", or "o" for recpectively read, write, other. + + # Define content-type + ct = setContentType(data) + extrameta << "Content-Type:#{ct}" + + return nil if @cacheDir.nil? # Not initialized, ignore request + + # Cache only if no remote objects (ftp:// or http://) in list of used files + if deps && deps.detect{|f| f[0] =~ /^o$/i } + debug "#{objPath} not cached because it needs remote resources" + return nil + end + + dirname, basename, filename, metaname = makeNames(objPath, objParam) + + FileUtils.mkdir_p(dirname) unless FileTest.directory?(dirname) + + # Write Meta file to a temp file (with .timestamp.randomNumber appended) + metaname_t = "#{metaname}.#{Time.new.strftime('%Y%m%d%H%M%S')}.#{rand(9999)}" + + # Data might need to be just a link to another .Data file + # if we find another requested path with different params but + # with identical MD5 sums + # Which is why we keep a ...xml.Data.[md5 sum] file without the parameters + # in its name that we can hard link to. + # e.g. A moron hits for 10 full handbooks with toto=1..10 in the URI, + # we'd end up with 10 identical large copies. With links we have only one + + # Old versions are expected to be cleaned up by the cacheWash() routine + # A Dir.glob() to find the previous ones would be too expensive + + # Compute MD5 digest + md5 = Digest::MD5.hexdigest(data) + + # Compress data if required + if @zipLevel > 0 then + bodyZ = data = gzip(data, @zipLevel) + else + bodyZ = nil + end + + # Set mtime of data file to latest mtime of all required files + # so that caching can work better because mtimes will be + # identical on all webnodes whereas creation date of data + # would be different on all nodes. 
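+ # (Time.now-8e8 below is roughly 25 years in the past: a sentinel guaranteed
+ # to be older than the mtime of any dependency file considered here.)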
+ maxmtime = Time.now-8e8 + fstat = nil + + begin + timeout(10){ + File.open("#{metaname_t}", "w") {|fmeta| + fmeta.puts(CacheStamp) + # Write filename;;size;;mtime for each file in deps[] + deps.each {|ffe| + ftype = ffe[0] + fdep = ffe[1] + if FileTest.file?(fdep) + s = File.stat(fdep) + fmeta.puts("#{fdep};;#{s.size};;#{s.mtime.utc};;#{ftype}") + maxmtime = s.mtime if s.mtime > maxmtime and ftype =~ /^r$/i + else + # A required file does not exist, use size=-1 and old timestamp + # so that when the file comes back, the cache notices a difference + # and no cache miss gets triggered as long as file does not exist + fmeta.puts("#{fdep};;-1;;Thu Nov 11 11:11:11 UTC 1971") + end + } + fmeta.puts ";;extra meta" + extrameta.each { |m| fmeta.puts m } + } + # Get exclusive access to the cache directory while moving files and/or creating data files + File.open(dirname) { |lockd| + while not lockd.flock(File::LOCK_NB|File::LOCK_EX) + # Timeout does not occur on a blocking lock + # Try a non-bloking one repeatedly for a few seconds until timeout occurs or lock is granted + # We are in a timeout block, remember + sleep 0.1 + end + # Remove previous Data + FileUtils.rm_rf(filename) + + # mv temp meta file to meta file + FileUtils.mv(metaname_t, metaname) + + # We keep a data file for the same requested path, with different params, + # but which ends up with same MD5 sum, i.e. identical results because of unused params + linkname = "#{basename}.#{md5}#{@zip}" + if FileTest.file?(linkname) then + # Data file already there, link to it + File.link(linkname, filename) + else + # Write data file and set its mtime to latest of all files it depends on + File.open("#{filename}", "w") {|fdata| fdata.write(data)} + # Create link + File.link(filename, linkname) + end + # mtime might need to be updated, or needs to be set + # e.g. when a dependency had changed but result files is identical + # This is needed to keep Last-Modified dates consistent across web nodes + File.utime(Time.now, maxmtime, filename) + fstat = File.stat(filename) + } + } + ensure + FileUtils.rm_rf(metaname_t) + end + + # Do we clean the cache? + washCache(dirname, 10) if @washNumber > 0 and rand(@washNumber) < 10 + + # Return stat(datafile) even if it's just been removed by washCache + # because another web node might still have it or will have it. + # Anyway, the cached item would be regenerated on a later request + # and a 304 would be returned if still appropriate at the time. + + # Return fstat of data file (for etag...) and zipped file + [fstat, bodyZ] + + rescue Timeout::Error, StandardError =>ex + if ex.class.to_s =~ /timeout::error/i then + warn("Timeout in cache store operation") + else + warn("Cache store error (#{$!})") + end + # Clean up before leaving + FileUtils.rm_rf(filename||"") + FileUtils.rm_rf(metaname||"") + nil # return nil so that caller can act if a failed store really is a problem + end + + + def Cache.washCache(dirname, tmout=30, cleanTree=false) + # Clean cache entries that are either too old compared to TTL (in seconds) + # or reduce total size to maxSize (in MB) + # oldDataOnly means to look only for unused *.Data.[md5] files that are not used anymore + # because file has been modified and has generated a new *.Data.[md5] file + + # timeout is the maximum time (in seconds) spent in here + + return nil if @cacheDir.nil? 
# Not initialized, ignore request + + # Also ignore request if dirname not equal to @cacheDir or under it + return nil unless dirname[0, @cacheDir.length] == @cacheDir + + # Also ignore request if dirname does not exist yet + return nil unless FileTest.directory?(dirname) + + # Also return if less than a minute has elapsed since latest cleanup + t0 = Time.new + return nil if t0 - @lastCleanup < 60 + + # Remember for next time + @lastCleanup = t0 + + Dir.chdir(dirname) { |d| + # Recreate lock file if it's been lost + unless File.exist?(@@lockfile) + File.open(@@lockfile, "w") { |lockf| lockf.puts("Lock file created on #{Time.now.utc} by gorg")} + end + + # Grab lockfile + File.open(@@lockfile) { |lockf| + if lockf.flock(File::LOCK_NB|File::LOCK_EX) then + infoMsg = "Cleaning up cache in #{dirname} (cleanTree=#{cleanTree}, tmout=#{tmout})" + info(infoMsg) + puts infoMsg if cleanTree + + timeout(tmout) { + totalSize, deletedFiles, scannedDirectories = washDir(dirname, cleanTree) + if totalSize >= 0 then + # Size == -1 means dir was locked, throwing an exception would have been nice :) + infoMsg = if cleanTree then + "Cache in #{dirname} is now #{totalSize/1024/1024} MB, #{deletedFiles} files removed in #{(Time.now-t0).to_i} seconds in #{scannedDirectories} directories" + else + "#{deletedFiles} files removed in #{(Time.now-t0).to_i} seconds in #{dirname}" + end + info(infoMsg) + puts infoMsg if cleanTree + end + } + else + # Locked dir, another process is busy cleaning up/ + debug("#{dirname} locked, skipping") + puts("#{dirname} locked, skipping") if cleanTree + end # of lock test + } # end of File.open(@@lockfile), close & release lock automatically + } + rescue Timeout::Error + info("Timeout while cleaning #{dirname}") + puts("Timeout while cleaning #{dirname}") if cleanTree + rescue StandardError =>ex + error("Error while cleaning cache: #{ex}") + puts("Error while cleaning cache: #{ex}") if cleanTree + end + + + private + + def Cache.washDir(dirname, cleanTree) + # Clean up cache starting from dirname and in subdirectories if cleanTree is true + # Return [newSize in bytes, # deleted files, # scanned directories] + size = nDeleted = nDirectories = 0 + + Dir.chdir(dirname) { |d| + hIno = Hash.new(0) # hash of file inodes with more than one link + lst = Array.new # array of file names, atime, ... + ttl = @ttl + ttl = 8e8 if ttl == 0 # No ttl, keep very old docs! + + # Get list of files sorted on their dirname+atime + Find.find('.') { |f| + begin + unless f =~ /^\.$|#{@@lockfile}/ # ignore "." and lockfile + ff = File.stat(f) + if ff.directory? then + Find.prune unless cleanTree + elsif ff.file? 
and f =~ /Meta|Data/ then + hIno[ff.ino] = ff.nlink if ff.nlink > 1 + # List of files has [name, atime, size, # links, inode] + lst << [f, ff.atime, ff.size, ff.nlink, ff.ino] + end + end + rescue + nil # File.stat can fail because file could have been deleted, ignore error + end + } + + # Compute total size + size = lst.inject(0){ |tot, a| tot + if a[3] > 0 then a[2]/a[3] else 0 end } + + # Delete old *.Data.[md5] files that are not being referenced anymore/ + lst.each { |a| + if a[3] == 1 && a[0] =~ /\.Data\.[0-9a-f]+(.gz)?$/ then + # Data file with no more links pointing to it + FileUtils.rm_rf(a[0]) + nDeleted += 1 + size -= a[2] + a[3] = 0 # Mark as deleted + end + } + + # Sort all files on atime + lst.sort!{ |a1, a2| a1[1] <=> a2[1] } + + t0 = Time.new + # Clean until size < maxSize _AND_ atime more recent than TTL + lst.each { |a| + break if size < @maxSize and t0-a[1] < ttl + next if a[3] < 1 # Already deleted in previous step + FileUtils.rm_rf(a[0]) + nDeleted += 1 + # Total size -= file size IF last link to data + if a[3] == 1 || hIno[a[4]] <= 1 then + size -= a[2] + end + hIno[a[4]] -= 1 if hIno[a[4]] > 0 + a[3] = 0 # Mark as deleted by setting nlinks to 0 + } + + # Remove deleted files from array + lst.reject! { |a| a[3] < 1 } + + + # Sort files per directory to enforce maxFiles + if cleanTree then + # Split the array in an array per directory + # and keep the files sorted on atime in each directory + slst = Hash.new + lst.length.times { + a = lst.shift + d = File.dirname(a[0]) + if slst[d] then + slst[d] << a + else + slst[d] = [a] + end + } + else + # If not cleaning whole tree, we have only a single dir + slst = {"." => lst} + end + + nDirectories = slst.length + + slst.each { |d, lst| + # Remove oldest files so that we have less than @maxFiles in it + if lst.length >= @maxFiles then + # Remove to leave up 90% of #maxFiles so we don't clean up only a handful of files repeatedly + (lst.length - 9*@maxFiles/10).times { + if a = lst.shift then + FileUtils.rm_rf(a[0]) + nDeleted += 1 + # Total size -= file size IF last link to data + if a[3] == 1 || hIno[a[4]] <= 1 then + size -= a[2] + end + hIno[a[4]] -= 1 if hIno[a[4]] > 0 + end + } + end + } + } #end of chdir + [size, nDeleted, nDirectories] + end + + + def Cache.makeNames(obj, params) + # Build meta filename and data filename from arguments + # + # obj is broken into a path and a filename with appended params + # e.g. /proj/en/index.xml?style=printable becomes /proj/en and index.xml+printable+yes + # or .#proj#en#index.xml+printable+yes + # depending on cacheTree param value + + # .Meta and .Data are appended respectively to the meta filename and data filename + # Base is the filename without appending params, e.g. .#proj#en#index.xml.Data + if @cacheTree then + # Use a path and a file + dir = "#{@cacheDir}#{File.dirname(obj)}" + base = f = File.basename(obj) + else + # Convert full path into a single filename + dir = @cacheDir + base = f = ".#{obj.gsub(/\//,'#')}" + end + + f = "#{f}+#{params.reject{|k,v| k.nil?}.sort.join('+')}" if params && params.to_a.length > 0 + # Remove funky chars and squeeze duplicates into single chars + f = f.gsub(/[^\w\#.+_-]/, "~").squeeze("~.#+") + + # Return names for Data and Meta files, and just the filepath (e.g. 
#proj#en#index.xml) + [dir, "#{dir}/#{base}.Data", "#{dir}/#{f}.Data#{@zip}", "#{dir}/#{f}.Meta"] + end +end + +end diff --git a/lib/gorg/cgi-bin/gorg.cgi b/lib/gorg/cgi-bin/gorg.cgi new file mode 100755 index 0000000..3c75dbc --- /dev/null +++ b/lib/gorg/cgi-bin/gorg.cgi @@ -0,0 +1,45 @@ +#! /usr/bin/ruby + +### Copyright 2004, Xavier Neys (neysx@gentoo.org) +# # +# # This file is part of gorg. +# # +# # gorg is free software; you can redistribute it and/or modify +# # it under the terms of the GNU General Public License as published by +# # the Free Software Foundation; either version 2 of the License, or +# # (at your option) any later version. +# # +# # gorg is distributed in the hope that it will be useful, +# # but WITHOUT ANY WARRANTY; without even the implied warranty of +# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# # GNU General Public License for more details. +# # +# # You should have received a copy of the GNU General Public License +# # along with Foobar; if not, write to the Free Software +### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +require 'cgi' + +require 'gorg/cgi' + +if ARGV.length == 1 and ['-F', '--filter'].include?(ARGV[0]) then + # cgi does not accept any params like gorg, + # Only test on -F or --filter being there and nothing else + do_Filter unless STDIN.tty? +else + # Make CGI's env public to get access to REQUEST_URI and DOCUMENT_ROOT + class CGI + public :env_table + end + + include Gorg + + # Config file is named in env var. GORG_CONF, or possibly REDIRECT_GORG_CONF + ENV["GORG_CONF"] = ENV["GORG_CONF"]||ENV["REDIRECT_GORG_CONF"] + + gorgInit + STDERR.close + + cgi = CGI.new + do_CGI(cgi) +end diff --git a/lib/gorg/cgi-bin/search.cgi b/lib/gorg/cgi-bin/search.cgi new file mode 100755 index 0000000..396001e --- /dev/null +++ b/lib/gorg/cgi-bin/search.cgi @@ -0,0 +1,50 @@ +#! /usr/bin/ruby + +### Copyright 2004, Xavier Neys (neysx@gentoo.org) +# # +# # This file is part of gorg. +# # +# # gorg is free software; you can redistribute it and/or modify +# # it under the terms of the GNU General Public License as published by +# # the Free Software Foundation; either version 2 of the License, or +# # (at your option) any later version. +# # +# # gorg is distributed in the hope that it will be useful, +# # but WITHOUT ANY WARRANTY; without even the implied warranty of +# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# # GNU General Public License for more details. +# # +# # You should have received a copy of the GNU General Public License +# # along with gorg; if not, write to the Free Software +### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +require 'cgi' +require 'gorg/search' + +# Make CGI's env public to get access to REQUEST_URI and DOCUMENT_ROOT +class CGI + public :env_table +end + +include Gorg + +# Config file is named in env var. 
GORG_CONF, or possibly REDIRECT_GORG_CONF +# ENV["PATH"] is used as a dirty hackish workaround a limitation of +# webrick's cgi handler: environment variables can't be passed to cgi's +# (REDIRECT_)GORG_CONF should be defined when running cgi's under apache +ENV["GORG_CONF"] = ENV["GORG_CONF"]||ENV["REDIRECT_GORG_CONF"]||ENV["PATH"] + +gorgInit +cgi = CGI.new + +# Params +# +# l = language code, no param will default to en, empty param defaults to any) +# q = query string +# p = page number in search result (0 < p < 1e6) +# s = page size (9 < p < 120) +# b = boolean search (y|Y|1 means yes, anything else no) + +gs = GDig::GSearch.new +gs.do_CGI(cgi) diff --git a/lib/gorg/cgi.rb b/lib/gorg/cgi.rb new file mode 100644 index 0000000..dfe8451 --- /dev/null +++ b/lib/gorg/cgi.rb @@ -0,0 +1,198 @@ +### Copyright 2004, Xavier Neys (neysx@gentoo.org) +# # +# # This file is part of gorg. +# # +# # gorg is free software; you can redistribute it and/or modify +# # it under the terms of the GNU General Public License as published by +# # the Free Software Foundation; either version 2 of the License, or +# # (at your option) any later version. +# # +# # gorg is distributed in the hope that it will be useful, +# # but WITHOUT ANY WARRANTY; without even the implied warranty of +# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# # GNU General Public License for more details. +# # +# # You should have received a copy of the GNU General Public License +# # along with Foobar; if not, write to the Free Software +### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +# Process CGI request, either from cgi or fcgi + +require "gorg/base" + +module Gorg + def do_Filter(tmout=30, params=nil) + # Read STDIN, transform, spit result out + timeout(tmout) { + # Give it a few seconds to read it all, then timeout + xml = STDIN.read + err, body, filelist = xproc(xml, params, false, true) + if err["xmlErrLevel"] > 0 then + STDERR.puts("#{err.collect{|e|e.join(':')}.join("\n")}") + elsif (body||"").length < 1 then + # Some transforms can yield empty content + STDERR.puts("Empty body") + else + STDOUT.puts(body) + end + } + rescue Timeout::Error, StandardError =>ex + # Just spew it out + STDERR.puts(ex) + end + + def do_CGI(cgi) + header = Hash.new + if cgi.path_info.nil? || cgi.env_table["REQUEST_URI"].index("/#{File.basename($0)}/") + # Sorry, I'm not supposed to be called directly, e.g. 
/cgi-bin/gorg.cgi/bullshit_from_smartass_skriptbaby + raise Gorg::Status::Forbidden + elsif cgi.request_method == "OPTIONS" + cgi.out('Allow'=>'GET,HEAD'){""} + elsif cgi.request_method == "HEAD" or cgi.request_method == "GET" + # lighttp is b0rked despite what they say :( + # PATH_INFO == "" and PATH_TRANSLATED == nil + if cgi.path_info.length > 0 then + # Apache, or any web browser that works + path_info = cgi.path_info + else + # lighttp, use SCRIPT_NAME instead + path_info = cgi.env_table['SCRIPT_NAME'] + end + query = Hash.new + cgi.params.each{ |p, v| query[p] = v.to_s} + # Get DOCUMENT_ROOT from environment + $Config["root"] = cgi.env_table['DOCUMENT_ROOT'] + + xml_file = cgi.path_translated||(cgi.env_table['DOCUMENT_ROOT']+cgi.env_table['SCRIPT_NAME']) + if not FileTest.file?(xml_file) + # Should have been checked by apache, check anyway + raise Gorg::Status::NotFound + else + # Process request + # Parse If-None-Match and If-Modified-Since request header fields if any + inm=ims=nil + begin + inm = split_header_etags(cgi.env_table['HTTP_IF_NONE_MATCH']) if cgi.env_table['HTTP_IF_NONE_MATCH'] + ims = Time.parse(cgi.env_table['HTTP_IF_MODIFIED_SINCE']) if cgi.env_table['HTTP_IF_MODIFIED_SINCE'] + ims = nil if ims > Time.now # Dates later than current must be ignored + rescue + # Just ignore ill-formated data + nil + end + if $Config['passthru'] && query["passthru"] && query["passthru"] != "0" then + # passthru allowed by config and requested by visitor, return file as text/plain + debug("Passthru granted for #{path_info}") + mstat = File.stat(xml_file) + raise Gorg::Status::NotModified.new(mstat) if notModified?(mstat, inm, ims) + body = IO.read(xml_file) + header['type'] = 'text/plain' + # If client accepts gzip encoding and we support it, return gzipped file + if $Config["zipLevel"] > 0 and ( cgi.accept_encoding =~ /gzip(\s*;\s*q=([0-9\.]+))?/ and ($2||"1") != "0" ) then + body = gzip(body, $Config["zipLevel"]) + header['Content-Encoding'] = "gzip" + header['Vary'] = "Accept-Encoding" + end + else + # Get cookies and add them to the parameters + if $Config["acceptCookies"] then + # Add cookies to our params + query.merge!(cookies_to_params(cgi.cookies)) + end + + if $Config["httphost"] then + # Add HTTP_HOST to stylesheet params + query["httphost"] = if $Config["httphost"][0] == '*' then + cgi.host||"" + elsif $Config["httphost"].include?('*') then + $Config["httphost"][0] + elsif $Config["httphost"].include?(cgi.host) then + $Config["httphost"][0] + else + cgi.host||"" + end + end + + xml_query = query.dup # xml_query==params passed to the XSL, query=>metadata in cache + if $Config["linkParam"] then + xml_query[$Config["linkParam"]] = path_info + end + + bodyZ = nil # Compressed version + body, mstat, extrameta = Cache.hit(path_info, query, inm, ims) + if body.nil? 
then + # Cache miss, process file and cache result + err, body, filelist, extrameta = xproc(xml_file, xml_query, true) + if err["xmlErrLevel"] > 0 then + raise "#{err.collect{|e|e.join(':')}.join('<br/>')}" + elsif (body||"").length < 1 then + # Some transforms can yield empty content (handbook?part=9&chap=99) + # Consider this a 404 + raise Gorg::Status::NotFound + else + # Cache the output if all was OK + mstat, bodyZ = Cache.store(body, path_info, query, filelist, extrameta) + debug("Cached #{path_info}, mstat=#{mstat.inspect}") + # Check inm & ims again as they might match if another web node had + # previously delivered the same data + if notModified?(mstat, inm, ims) and extrameta.join !~ /set-cookie/i + raise Gorg::Status::NotModified.new(mstat) + end + end + else + if $Config["zipLevel"] > 0 then + bodyZ = body + body = nil + end + end + # If client accepts gzip encoding and we support it, return gzipped file + if bodyZ and $Config["zipLevel"] > 0 and ( cgi.accept_encoding =~ /gzip(\s*;\s*q=([0-9\.]+))?/ and ($2||"1") != "0" ) then + body = bodyZ + header['Content-Encoding'] = "gzip" + header['Vary'] = "Accept-Encoding" + else + unless body then + # We need to unzip bodyZ into body, i.e. we cached zipped data but client does not support gzip + body = gunzip(bodyZ) + end + end + # Add cookies to http header + cookies = makeCookies(extrameta) + if cookies then + header['cookie'] = cookies + end + # Add Content-Type to header + ct = contentType(extrameta) + if ct then + # Turn application/xhtml+xml into text/html if browser does not accept it + if cgi.accept !~ /application\/xhtml\+xml/ and ct =~ /application\/xhtml\+xml(.*)$/ then + header['type'] = "text/html#{$1}" + else + header['type'] = ct + end + else + header['type'] = 'text/plain' + end + end + # Add ETag & Last-Modified http headers + # NB: it's simply mstat(file.xml) when passthru=1 + if mstat then + header['ETag'] = makeETag(mstat) + header['Last-Modified'] = mstat.mtime.httpdate + end + end + cgi.out(header){body} + else # Not a HEAD or GET + raise Gorg::Status::NotAllowed + end + rescue => ex + if ex.respond_to?(:errCode) then + # One of ours (Gorg::Status::HTTPStatus) + cgi.out(ex.header){ex.html} + else + # Some ruby exceptions occurred, make it a 500 + syserr = Gorg::Status::SysError.new + cgi.out('Status'=>syserr.errSts){syserr.html(ex)} + error("do_CGI() failed: #{$!}") + end + end +end diff --git a/lib/gorg/fcgi-bin/gorg.fcgi b/lib/gorg/fcgi-bin/gorg.fcgi new file mode 100755 index 0000000..1f81cf2 --- /dev/null +++ b/lib/gorg/fcgi-bin/gorg.fcgi @@ -0,0 +1,61 @@ +#! /usr/bin/ruby + +### Copyright 2004, Xavier Neys (neysx@gentoo.org) +# # +# # This file is part of gorg. +# # +# # gorg is free software; you can redistribute it and/or modify +# # it under the terms of the GNU General Public License as published by +# # the Free Software Foundation; either version 2 of the License, or +# # (at your option) any later version. +# # +# # gorg is distributed in the hope that it will be useful, +# # but WITHOUT ANY WARRANTY; without even the implied warranty of +# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# # GNU General Public License for more details. +# # +# # You should have received a copy of the GNU General Public License +# # along with gorg; if not, write to the Free Software +### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +require 'cgi' +require 'fcgi' + +# Overload read_from_cmdline to avoid crashing when request method +# is neither GET/HEAD/POST. 
Default behaviour is to read input from +# STDIN. Not really useful when your webserver gets OPTIONS / :-( +class CGI + module QueryExtension + def read_from_cmdline + '' + end + end +end + + +require 'gorg/cgi' + +include Gorg + +gorgInit +STDERR.close + +# Should I commit suicide after a while, life can be so boring! +ak47 = $Config["autoKill"]||0 + +countReq = 0; t0 = Time.new +# Process CGI requests sent by the fastCGI engine +FCGI.each_cgi do |cgi| + countReq += 1 + do_CGI(cgi) + # Is it time to leave? + # If maximum number of requests has been exceeded _AND_ at least 1 full minute has gone by + if ak47 > 0 && countReq >= ak47 && Time.new - t0 > 60 then + info("Autokill : #{countReq} requests have been processed in #{Time.new-t0} seconds") + Process.kill("USR1",$$) + else + # Garbage Collect regularly to help keep memory + # footprint low enough without costing too much time. + GC.start if countReq%50==0 + end +end diff --git a/lib/gorg/log.rb b/lib/gorg/log.rb new file mode 100644 index 0000000..4ef05d6 --- /dev/null +++ b/lib/gorg/log.rb @@ -0,0 +1,56 @@ +### Copyright 2004, Xavier Neys (neysx@gentoo.org) +# # +# # This file is part of gorg. +# # +# # gorg is free software; you can redistribute it and/or modify +# # it under the terms of the GNU General Public License as published by +# # the Free Software Foundation; either version 2 of the License, or +# # (at your option) any later version. +# # +# # gorg is distributed in the hope that it will be useful, +# # but WITHOUT ANY WARRANTY; without even the implied warranty of +# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# # GNU General Public License for more details. +# # +# # You should have received a copy of the GNU General Public License +# # along with gorg; if not, write to the Free Software +### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +# Write logging info for our little gorg + +require 'syslog' +require 'webrick/log' + +module Gorg + # Make log functions available as if we were inside a log instance + # If no $Log global variable has been initialized, do nothing + def fatal(msg) $Log.fatal(msg) if $Log; end + def error(msg) $Log.error(msg) if $Log; end + def warn(msg) $Log.warn(msg) if $Log; end + def info(msg) $Log.info(msg) if $Log; end + def debug(msg) $Log.debug(msg) if $Log; end + + module Log + + class MyLog < WEBrick::BasicLog + # Interface to WEBrick log system + # Not much to add at this time ;-) + end + + class MySyslog + # Interface to syslog + def initialize(appname) + # Open syslog if not already done (only one open is allowed) + @@syslog = Syslog.open(appname) unless defined?(@@syslog) + # Make sure messages get through (WEBrick has its own filter) + @@syslog.mask = Syslog::LOG_UPTO(Syslog::LOG_ERR) + end + + def <<(str) + # WEBrick's logging requires the << method + # Just forward string to syslog + @@syslog.err(str) + end + end + end +end diff --git a/lib/gorg/search.rb b/lib/gorg/search.rb new file mode 100644 index 0000000..c90448a --- /dev/null +++ b/lib/gorg/search.rb @@ -0,0 +1,444 @@ +### Copyright 2004, Xavier Neys (neysx@gentoo.org) +# # +# # This file is part of gorg. +# # +# # gorg is free software; you can redistribute it and/or modify +# # it under the terms of the GNU General Public License as published by +# # the Free Software Foundation; either version 2 of the License, or +# # (at your option) any later version. 
+# # +# # gorg is distributed in the hope that it will be useful, +# # but WITHOUT ANY WARRANTY; without even the implied warranty of +# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# # GNU General Public License for more details. +# # +# # You should have received a copy of the GNU General Public License +# # along with Foobar; if not, write to the Free Software +### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +require 'dbi' +require 'yaml' +require 'gorg/base' +require 'cgi' + +module GDig + class GFile + + def initialize(root, f, xlang) + @root = root + @fname = f + @xpath2lang = xlang + end + + def txt + unless @txt then + @txt, @lang = txtifyFile + end + @txt + end + + def lang + unless @lang then + @txt, @lang = txtifyFile + end + @lang + end + + private + + def txtifyFile + x=Gorg::XSL.new + x.xsl = <<EOXSL +<?xml version="1.0" encoding="UTF-8"?> + <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"> + <xsl:output encoding="UTF-8" method="text" indent="no"/> + <xsl:template match="/"> +EOXSL + if (@xpath2lang||"").length > 1 then + x.xsl << <<EOXSL + <xsl:if test="#{@xpath2lang}"> + <xsl:value-of select="concat('%%LANG%%', #{@xpath2lang}, '%%
')"/> + </xsl:if> +EOXSL + end + x.xsl << <<EOXSL + <xsl:apply-templates/> + </xsl:template> + <xsl:template match="*"> + <xsl:apply-templates select="@*"/> + <xsl:apply-templates/> + </xsl:template> + <xsl:template match="@*"> + <xsl:value-of select="concat(' ',.,' ')"/> + </xsl:template> + </xsl:stylesheet> +EOXSL + x.xroot = @root + x.xml = @fname + x.process + + if x.xerr and x.xerr["xmlErrLevel"] >= 3 then + raise x.xerr["xmlErrMsg"] + end + + t = x.xres + if t =~ /^%%LANG%%([^%]+)%%/ then + l = $1 + t = $'.strip + else + l = nil + end + t << @fname + [t.squeeze("\n"), l] + end + end + + class DBFile + attr_reader :fid, :webname + def initialize(dbh, webname, localname) + @dbh = dbh + @webname = webname + @localname = localname + @row = @dbh.select_one("SELECT id,path,lang,timestamp,size FROM files where path = ?", webname) + if @row then + @fid = @row['id'] + else + @fid = nil + end + end + + def DBFile.remove(dbh, fid) + if fid then + dbh.do("delete from files where id=#{fid}") + end + end + + def uptodate? + if @fid then + unless @row then + @row = @dbh.select_one("SELECT id,path,lang,timestamp,size FROM files where id=#{@fid}") + end + if (fstat=File.stat(@localname)) and @row then + @row['timestamp']==fstat.mtime.to_s and @row['size']==fstat.size + else + false + end + end + end + + def update(blob, lang) + fstat=File.stat(@localname) + if @fid then + # update + sql = "update files set lang = ?, txt = ?, timestamp = ?, size = ? where id=#{@fid}" + @dbh.do(sql, lang, blob, fstat.mtime.to_s, fstat.size) + else + # insert new one + sql = "insert into files (path, lang, txt, timestamp, size) values (?, ?, ?, ?, ?)" + @dbh.do(sql, webname, lang, blob, fstat.mtime.to_s, fstat.size) + if id=@dbh.select_one("select last_insert_id()") then + @fid = id[0] + else + @fid = nil + end + end + end + end + + class GSearch + attr_reader :dbh, :searchTxt, :searchResult + include Gorg + + def initialize + @dbh = DBI.connect($Config['dbConnect'], $Config['dbUser'], $Config['dbPassword']) + @dbh['AutoCommit'] = true + end + + def indexDir + wipe = false + scanDir { |webName, localName| + begin + dbf = GDig::DBFile.new(@dbh, webName, localName) + unless dbf.uptodate? 
then + gf = GFile.new($Config['root'], webName, $Config['xlang']) + blob = gf.txt + lang = gf.lang + if (lang||"").length < 1 then + # No lang attribute, see if we can use the filename + if $Config['flang'] and $Config['flang'].match(webName) then + lang = $Config['flang'].match(webName)[1] + end + end + dbf.update(blob, lang) + wipe = true + debug "#{Time.new.to_i} #{webName} indexed" + end + rescue Exception => e + error "Failed to index #{webName} : #{e.to_s}" + end + } + wipeSearches if wipe + end + + def cleanup + # Remove files from db either because + # they should now be excluded or because they do not exist anymore + wipe = false + @dbh.select_all('select id, path from files') { |row| + if not fileMatch(row[1]) or not File.file?($Config['root']+row[1]) then + DBFile.remove(@dbh, row[0]) + debug "GDig::GSearch: #{row[1]} removed" + wipe = true + end + } + wipeSearches if wipe + end + + def do_CGI(cgi) + $Config["root"] = cgi.env_table['DOCUMENT_ROOT']||$Config["root"] + query = {} + # Get cookies + if $Config["acceptCookies"] then + # Add cookies to our params + query = cookies_to_params(cgi.cookies) + end + # Add URI params that are not used by search engine (p,q,l,s) + cgi.params.each{ |p, v| query[p] = v.to_s} + + # Choose language + if cgi.has_key?("l") then + lang = cgi["l"] + elsif query.has_key?("SL") then + lang = query["SL"] + else + lang = nil + end + + # Perform search + search(cgi["q"], lang) + + if cgi.has_key?("p") and cgi["p"] =~ /^[0-9]{1,5}$/ then + p = cgi["p"].to_i + else + p = 1 + end + + if cgi.has_key?("s") and cgi["s"] =~ /^[0-9]{2,3}$/ then + s = cgi["s"].to_i + elsif query.has_key?("PL") and query["PL"] =~ /^[0-9]{2,3}$/ then + s = query["PL"].to_i + else + s = 20 + end + s = 120 if s > 120 + + xml = xmlResult(p,s) + header = {}; body = "" + if cgi.has_key?("passthru") and $Config["passthru"] then + header = {'type' => 'text/plain'} + body = xml + else + if $Config["linkParam"] then + query[$Config["linkParam"]] = cgi.script_name + end + if $Config["httphost"] then + # Add HTTP_HOST to stylesheet params + query["httphost"] = if $Config["httphost"][0] == '*' then + cgi.host||"" + elsif $Config["httphost"].include?('*') then + $Config["httphost"][0] + elsif $Config["httphost"].include?(cgi.host) then + $Config["httphost"][0] + else + cgi.host + end + end + + err, body, filelist, extra = xproc(xml, query, false) + if err["xmlErrLevel"] > 0 then + raise "#{err.collect{|e|e.join(':')}.join('<br/>')}" + end + cookies = makeCookies(extra) + ct = setContentType(body) + # Turn application/xhtml+xml into text/html if browser does not accept it + if cgi.accept !~ /application\/xhtml\+xml/ and ct =~ /application\/xhtml\+xml(.*)$/ then + header = {'type' => "text/html#{$1}"} + else + header = {'type' => ct} + end + + # Add cookies to http header + if cookies then + header['cookie'] = cookies + end + end + # If client accepts gzip encoding and we support it, return gzipped file + if $Config["zipLevel"] > 0 and ( cgi.accept_encoding =~ /gzip(\s*;\s*q=([0-9\.]+))?/ and ($2||"1") != "0" ) then + body = gzip(body, $Config["zipLevel"]) + header['Content-Encoding'] = "gzip" + header['Vary'] = "Accept-Encoding" + end + cgi.out(header){body} + rescue => ex + syserr = Gorg::Status::SysError.new + cgi.out('Status'=>syserr.errSts){syserr.html(ex)} + error("GSearch::do_CGI() failed: #{$!}") + end + + def search(str, lang) + @searchTxt = str + @searchResult = nil + if (lang||"") == "" then + @searchLang = '%' + else + @searchLang = lang + end + if str =~ 
+
+    def search(str, lang)
+      @searchTxt = str
+      @searchResult = nil
+      if (lang||"") == "" then
+        @searchLang = '%'
+      else
+        @searchLang = lang
+      end
+      # Use MySQL boolean mode if the query contains boolean operators (+ - < > ( ) ~ *)
+      if str =~ /(^|\s)(([+<)(>~-][^+<)(>~-]+)|([^+<)(>~-]+\*))(\s|$)/ then
+        @searchBool = "Y"
+        boolClause = "in boolean mode"
+      else
+        @searchBool = "N"
+        boolClause = ""
+      end
+      if @searchTxt.length > 0 then
+        @searchResult = loadSearch
+        unless @searchResult then
+          @searchResult = []
+          # Perform full-text search
+          sql = <<EOSQL
+select id, path, lang, match (txt) against ( ? ) as score
+from files
+where lang like ? and match (txt) against ( ? #{boolClause} )
+order by score desc
+EOSQL
+          @dbh.select_all(sql, @searchTxt, @searchLang, @searchTxt).each { |r| @searchResult << [r[0],r[1],r[2],r[3]] }
+          saveSearch
+        end
+      end
+      @searchResult
+    end
+
+    def xmlResult(page=1, pageLength=25)
+      # Build an xml document like:
+      # <search page="p" pages="n">
+      #   <for>search string</for>
+      #   <found link="/path/to/file.xml" lang="fr">
+      #     blah blah <b>word2</b> bleh
+      #   </found>
+      # </search>
+      pageLength = 20 if pageLength < 1
+      xml = "<?xml version='1.0' encoding='UTF-8'?>\n\n"
+
+      if @searchResult and @searchResult.length >= 1 then
+        removeDeadFiles
+        nPages = @searchResult.length / pageLength
+        nPages += 1 unless 0 == @searchResult.length.modulo(pageLength)
+        page = nPages if page > nPages
+        page = 1 if page < 1
+
+        xml << "<search page='#{page}' pages='#{nPages}' pageLength='#{pageLength}' lang='#{xmlEscape(@searchLang)}' bool='#{@searchBool}'>\n"
+        xml << xmlSearchFor
+        @searchResult[(page-1)*pageLength..page*pageLength-1].each { |r|
+          xml << " <found link='#{r[1]}' lang='#{r[2]}' score='#{r[3]}'>\n"
+          xml << xmlBlobSample(r[0]) << "\n"
+          xml << " </found>\n"
+        }
+      else
+        xml << "<search page='0' pages='0'>\n"
+        xml << xmlSearchFor
+      end
+      xml << "</search>\n"
+    end
+
+    def scanDir
+      # Yield [web path, absolute path] for each file under the document root
+      # that matches the include/exclude patterns
+      Dir.chdir($Config['root']) {
+        `find -L . -type f`.split("\n").each{ |localFile|
+          if File.file?(localFile) then
+            webFile = localFile[1..-1]
+            if fileMatch(webFile) then
+              yield [webFile, File.expand_path(localFile)]
+            end
+          end
+        }
+      }
+    end
+
+    private
+
+    def xmlBlobSample(fileID)
+      # Extract a sample of text around the first matching word and mark the sought words up
+      blob = ""
+      r = @dbh.select_one("select txt from files where id = #{fileID}")
+      if r then
+        blob = r[0]
+        # Find first matching word and extract some text around it
+        stxt = @searchTxt.tr('`.,\'"\-_+~<>/?;:[]{}+|\\)(*&^%\$\#@!', ' ').split(' ')
+        regs = stxt.collect { |w| Regexp.new(w, true, 'U') }
+        ix = nil
+        regs.each { |r| break if ix=blob.index(r) }
+        if ix then
+          if ix < 80 then
+            x = 0
+          else
+            x = blob[0,ix-60].rindex(/[ ,\.]/)
+            x = 0 unless x
+          end
+          y = blob.index(/[,\. ]/, ix+80)
+          y = -1 unless y
+          blob = xmlEscape(blob[x..y])
+          # Mark up sought words
+          regs.each { |r| blob.gsub!(r){|t| "<b>#{t}</b>"} }
+        else
+          # No match found, return the first 120-odd characters up to a word boundary
+          # (searching from offset 120; the original sliced first, which returned
+          # an offset relative to position 120 and truncated far too early)
+          x = blob.index(/[ ,\.]/, 120) || -1
+          blob = xmlEscape(blob[0..x])
+        end
+      end
+      blob
+    end
+
+    def xmlEscape(str)
+      if str
+        str.gsub('&','&amp;').gsub('>','&gt;').gsub('<','&lt;')
+      else
+        "w00t"
+      end
+    end
+
+    def loadSearch
+      # Reuse a saved result set for an identical query, if any
+      if @searchTxt then
+        r = @dbh.select_one("select result from savedsearches where words = ? and lang = ? and bool = ?", @searchTxt, @searchLang, @searchBool)
+        if r then
+          YAML::load(r[0])
+        end
+      end
+    end
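+
+    # Editor's note with an illustrative schema sketch: the queries in this
+    # class assume MySQL tables roughly like the following. Table and column
+    # names come from the code above; the types and the DDL itself are
+    # assumptions, check the gorg setup instructions for the real statements.
+    #
+    #   create table files (
+    #     id        int auto_increment primary key,
+    #     path      varchar(255) not null,
+    #     lang      varchar(16),
+    #     txt       mediumtext,
+    #     timestamp varchar(32),
+    #     size      int,
+    #     fulltext (txt)              -- required by match...against
+    #   ) engine=MyISAM;              -- MyISAM fulltext, as was usual in 2004
+    #
+    #   create table savedsearches (
+    #     words  varchar(255),
+    #     lang   varchar(16),
+    #     bool   char(1),
+    #     result mediumtext           -- YAML dump of the result rows
+    #   );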
and bool = ?", @searchTxt, @searchLang, @searchBool) + @dbh.do("insert into savedsearches (words, lang, bool, result) values(?, ?, ?, ?)", @searchTxt, @searchLang, @searchBool, @searchResult.to_yaml) + end + end + + def wipeSearches + @dbh.do("delete from savedsearches") + end + + def fileMatch(f) + $Config['in/out'].each { |inout| + return inout[0] if inout[1].match(f) + } + false + end + + def removeDeadFiles + if @searchResult then + @searchResult.reject!{ |r| not File.file?($Config['root']+r[1]) } + end + end + + def xmlSearchFor + " <for>#{xmlEscape(@searchTxt)}</for>\n" if @searchTxt + end + + end + +end diff --git a/lib/gorg/www.rb b/lib/gorg/www.rb new file mode 100644 index 0000000..eb0c8fa --- /dev/null +++ b/lib/gorg/www.rb @@ -0,0 +1,207 @@ +### Copyright 2004, Xavier Neys (neysx@gentoo.org) +# # +# # This file is part of gorg. +# # +# # gorg is free software; you can redistribute it and/or modify +# # it under the terms of the GNU General Public License as published by +# # the Free Software Foundation; either version 2 of the License, or +# # (at your option) any later version. +# # +# # gorg is distributed in the hope that it will be useful, +# # but WITHOUT ANY WARRANTY; without even the implied warranty of +# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# # GNU General Public License for more details. +# # +# # You should have received a copy of the GNU General Public License +# # along with gorg; if not, write to the Free Software +### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +# Run the stand-alone webserver and serve gentoo.org + +require 'gorg/base' +require 'webrick' +require 'cgi' + +class GentooServlet < WEBrick::HTTPServlet::FileHandler + include Gorg + + def do_GET(req, res) + hit = "#{$Config["root"]}#{req.path}" + cacheName = req.path + if FileTest.directory?(hit) and FileTest.exist?(hit+"/index.xml") then + # Use $URI/index.xml for directories that have an index.xml file + hit << "/index.xml" + cacheName << "/index.xml" + end + hit.squeeze!('/') + cacheName.squeeze!('/') + if FileTest.directory?(hit) then + super # Use default FileHandler for directories that have no index.xml + else + if hit !~ /\.(xml)|(rdf)|(rss)$/ then + super # Use default FileHandler if not an xml file + else + if not FileTest.exist?(hit) then + super # Use default FileHandler to handle 404 (file does not exist) + else + # Parse If-None-Match and If-Modified-Since request header fields if any + ims=inm=nil + begin + ims = Time.parse(req['if-modified-since']) if req['if-modified-since'] + inm = split_header_etags(req['if-none-match']) if req['if-none-match'] + rescue + # Just ignore ill-formated data + nil + end + begin + res['Charset'] = 'UTF-8' + # Process xml file or return xml file if passthru=1 + if $Config['passthru'] && req.query && req.query["passthru"] && req.query["passthru"] != "0" then + # passthru allowed by config and requested by visitor, return file as text/plain + mstat = File.stat(hit) + raise Gorg::Status::NotModified.new(mstat) if notModified?(mstat, inm, ims) + debug("Passthru granted for #{hit}") + body = IO.read(hit) + # If client accepts gzip encoding and we support it, return gzipped file + if $Config["zipLevel"] > 0 and (req.accept_encoding.include?("gzip") or req.accept_encoding.include?("x-gzip")) then + res.body = gzip(body, $Config["zipLevel"]) + res['Content-Encoding'] = "gzip" + res['Vary'] = "Accept-Encoding" + else + res.body = body + end + res['Content-Type'] = 'text/plain' + else + query_params = 
+          begin
+            res['Charset'] = 'UTF-8'
+            # Process the xml file, or return it as-is if passthru=1
+            if $Config['passthru'] && req.query && req.query["passthru"] && req.query["passthru"] != "0" then
+              # passthru allowed by config and requested by visitor, return file as text/plain
+              mstat = File.stat(hit)
+              raise Gorg::Status::NotModified.new(mstat) if notModified?(mstat, inm, ims)
+              debug("Passthru granted for #{hit}")
+              body = IO.read(hit)
+              # If the client accepts gzip encoding and we support it, return a gzipped body
+              if $Config["zipLevel"] > 0 and (req.accept_encoding.include?("gzip") or req.accept_encoding.include?("x-gzip")) then
+                res.body = gzip(body, $Config["zipLevel"])
+                res['Content-Encoding'] = "gzip"
+                res['Vary'] = "Accept-Encoding"
+              else
+                res.body = body
+              end
+              res['Content-Type'] = 'text/plain'
+            else
+              query_params = req.query.dup
+              # Get cookies and add them to the parameters
+              if $Config["acceptCookies"] then
+                # We need CGI::Cookie objects to be compatible with our cgi modules (stupid WEBrick)
+                ck = req.raw_header.find{|l| l =~ /^cookie: /i}
+                if ck then
+                  # $' holds whatever followed the "cookie: " match above
+                  query_params.merge!(cookies_to_params(CGI::Cookie.parse($'.strip)))
+                  debug "query params are " + query_params.inspect
+                end
+              end
+              if $Config["httphost"] then
+                # Add HTTP_HOST to stylesheet params
+                query_params["httphost"] = if $Config["httphost"][0] == '*' then
+                  req.host||""
+                elsif $Config["httphost"].include?('*') then
+                  $Config["httphost"][0]
+                elsif $Config["httphost"].include?(req.host) then
+                  $Config["httphost"][0]
+                else
+                  req.host||""
+                end
+              end
+
+              bodyZ = nil
+              body, mstat, extrameta = Gorg::Cache.hit(cacheName, query_params, inm, ims)
+              if body.nil? then
+                # Cache miss, process the file and cache the result
+                xml_query = query_params.dup
+                if $Config["linkParam"] then
+                  xml_query[$Config["linkParam"]] = req.path
+                end
+                err, body, filelist, extrameta = xproc(hit, xml_query, true)
+                warn("#{err.collect{|e|e.join(':')}.join('; ')}") if err["xmlErrLevel"] == 1
+                error("#{err.collect{|e|e.join(':')}.join('; ')}") if err["xmlErrLevel"] > 1
+                # Display error message if any, just like the cgi/fcgi versions
+                raise ("#{err.collect{|e|e.join(':')}.join('<br/>')}") if err["xmlErrLevel"] > 0
+                # Cache output
+                mstat, bodyZ = Gorg::Cache.store(body, cacheName, query_params, filelist, extrameta)
+              else
+                # Cache hit; the cache hands back gzipped data when compression is on
+                if $Config["zipLevel"] > 0 then
+                  bodyZ = body
+                  body = nil
+                end
+              end
+              # If the client accepts gzip encoding and we support it, return the gzipped data
+              if bodyZ and $Config["zipLevel"] > 0 and (req.accept_encoding.include?("gzip") or req.accept_encoding.include?("x-gzip")) then
+                res.body = bodyZ
+                res['Content-Encoding'] = "gzip"
+                res['Vary'] = "Accept-Encoding"
+              else
+                if body then
+                  res.body = body
+                else
+                  # We cached zipped data but the client does not accept gzip, unzip it on the fly
+                  res.body = gunzip(bodyZ)
+                end
+              end
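+              # Editor's sketch (not in the original): the cache contract used
+              # above, as inferred from this call site; see lib/gorg/cache.rb
+              # for the authoritative behaviour.
+              #
+              #   body, mstat, meta = Gorg::Cache.hit(name, params, inm, ims)
+              #   # body  : cached output (gzipped when zipLevel > 0), nil on a miss
+              #   # mstat : a File::Stat-like object used for ETag/Last-Modified
+              #   mstat, bodyZ = Gorg::Cache.store(body, name, params, filelist, meta)
+              #   # filelist ties the entry to every file xproc read, so a
+              #   # changed source invalidates the cached result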
+              # Add cookies to http header
+              cookies = makeCookies(extrameta)
+              if cookies then
+                cookies.each{|c| res.cookies << c.to_s}
+              end
+              # Add Content-Type to header
+              ct = contentType(extrameta).split(';')[0]
+              if ct then
+                # Turn application/xhtml+xml into text/html if the browser does not accept it
+                if req.accept.to_s !~ /application\/xhtml\+xml/ and ct =~ /application\/xhtml\+xml(.*)$/ then
+                  res['Content-Type'] = "text/html#{$1}"
+                else
+                  res['Content-Type'] = ct
+                end
+              else
+                res['Content-Type'] = 'text/plain'
+              end
+            end
+            if mstat then
+              res['ETag'] = makeETag(mstat)
+              res['Last-Modified'] = mstat.mtime.httpdate
+            end
+          rescue => ex
+            if ex.respond_to?(:errCode) then
+              # One of ours (Gorg::Status::HTTPStatus)
+              res.body = ex.html
+              res.status = ex.errCode
+              ex.header.each {|k,v| res[k]=v unless k =~ /status|cookie/i}
+            else
+              # Some ruby exception occurred, turn it into a syserr
+              syserr = Gorg::Status::SysError.new
+              res.body = syserr.html(ex)
+              res.status = syserr.errCode
+            end
+          end
+        end
+      end
+    end
+  end
+end
+
+###
+#|# Start Here
+###
+
+def www
+  # Log accesses to either stderr, syslog or a file
+  if $Config["accessLog"] == "syslog"
+    # Use syslog, with our own format based on the default but without timestamp
+    access_log = [ [ @syslog, "HIT %h \"%r\" %s %b" ] ]
+    STDERR.close
+  elsif $Config["accessLog"] == "stderr"
+    # Log to stderr, with the same timestamp-less format
+    access_log = [ [ STDERR, "HIT %h \"%r\" %s %b" ] ]
+  else
+    # Open the file and use it; if it's not writable, tough!
+    access_log_stream = File.open($Config["accessLog"], "a")
+    access_log = [ [ access_log_stream, WEBrick::AccessLog::COMBINED_LOG_FORMAT ] ]
+    STDERR.close
+  end
+
+  s = WEBrick::HTTPServer.new( :BindAddress => $Config["listen"], :AccessLog=>access_log, :Logger => $Log, :Port => $Config["port"], :CGIPathEnv => ENV["GORG_CONF"])
+
+  # Mount extra directories first, then the GentooServlet on /
+  $Config["mounts"].each { |m|
+    s.mount(m[0], WEBrick::HTTPServlet::FileHandler, m[1])
+  }
+  s.mount("/", GentooServlet, $Config["root"])
+
+  # Start server, shut down cleanly on Ctrl-C
+  trap("INT"){ s.shutdown }
+
+  puts "\n\nStarting the Gorg web server on #{$Config['listen']}:#{$Config['port']}\n\nHit Ctrl-C or type \"kill #{$$}\" to stop it\n\n"
+
+  s.start
+end
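+
+# Editor's sketch (not part of the original file): a minimal $Config for
+# running the stand-alone server. The keys are the ones the code above reads;
+# every value is a placeholder, and $Log must also be a WEBrick-compatible
+# logger before www is called.
+#
+#   $Config = {
+#     "root"      => "/var/www/localhost/htdocs",
+#     "listen"    => "127.0.0.1",
+#     "port"      => 8008,
+#     "accessLog" => "stderr",
+#     "zipLevel"  => 2,      # 0 disables gzip support
+#     "mounts"    => [],     # extra [web path, local path] pairs
+#   }
+#   www                      # blocks until Ctrl-C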