diff options
-rw-r--r-- | COPYING | 340 | ||||
-rw-r--r-- | Changelog | 120 | ||||
-rw-r--r-- | README | 144 | ||||
-rwxr-xr-x | bin/gorg | 100 | ||||
-rw-r--r-- | data/gorg/schema.sql | 19 | ||||
-rw-r--r-- | etc/gorg/gorg.conf.sample | 149 | ||||
-rw-r--r-- | etc/gorg/lighttpd.conf.sample | 118 | ||||
-rw-r--r-- | etc/gorg/vhost.sample | 152 | ||||
-rw-r--r-- | ext/gorg/xsl/MANIFEST | 3 | ||||
-rw-r--r-- | ext/gorg/xsl/extconf.rb | 22 | ||||
-rw-r--r-- | ext/gorg/xsl/xsl.c | 894 | ||||
-rw-r--r-- | ext/gorg/xsl/xsl.h | 44 | ||||
-rw-r--r-- | lib/gorg/base.rb | 602 | ||||
-rw-r--r-- | lib/gorg/cache.rb | 493 | ||||
-rwxr-xr-x | lib/gorg/cgi-bin/gorg.cgi | 45 | ||||
-rwxr-xr-x | lib/gorg/cgi-bin/search.cgi | 50 | ||||
-rw-r--r-- | lib/gorg/cgi.rb | 198 | ||||
-rwxr-xr-x | lib/gorg/fcgi-bin/gorg.fcgi | 61 | ||||
-rw-r--r-- | lib/gorg/log.rb | 56 | ||||
-rw-r--r-- | lib/gorg/search.rb | 444 | ||||
-rw-r--r-- | lib/gorg/www.rb | 207 | ||||
-rw-r--r-- | setup.rb | 1360 |
22 files changed, 5621 insertions, 0 deletions
@@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. 
+ + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. 
The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. 
(Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. 
If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. 
+ + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/Changelog b/Changelog new file mode 100644 index 0000000..4fbb39d --- /dev/null +++ b/Changelog @@ -0,0 +1,120 @@ +2004-12-06 gorg-0.3 initiated + +2004-12-07 http://bugs.gentoo.org/show_bug.cgi?id=73163 from Masatomo Nakano <nakano@gentoo.org> + Use PATH_TRANSLATED to resolve files in ~username + +2004-12-16 Issue Etag and Last-Modified response headers fields + Handle If-Modified-Since and If-None-Match request header fields + Client caching will work in a consistent manner across several web nodes + Set processed document mtime to latest mtime of dependent files and use it for Last-Modified + Use timeouts when accessing cache (clean & store operations) + Fork & Detach when cleaning up the cache + Fix bugs: + . in XSLT C extension when no document root is used + . empty request params prevent use of cache + +====================================================== + +2005-01-10 gorg-0.4 initiated + +2005-01-10 Update Caching: + . Use a dir tree under cache dir that is identical + to the server tree. Some installations seem + to have trouble with 30000+ files in one directory + New param cacheTree = 1 activates this behaviour + . Define MaxFiles param to limit number of files in one dir + . Server will only try to clean the dir it caches to + . Added -C (--clean-cache) option to gorg to clean whole cache tree with a cron job e.g. +2005-01-10 Add filter functionality: + . Added -F (--filter) option to process STDIN + Both gorg and gorg.cgi can be used as a filter. 
+ gorg.cgi needs the -F (--filter) option on its command line + gorg will behave as a filter when no options are used *and* data is piped into it + . Added -W (--web) Starting the web server is the default on the cmd line + but it might need to be told when not started interactively + +====================================================== + +2005-01-10 gorg-0.5 initiated + +2005-04-04 . Trivial code cleanup to get rid of signal catching that was introduced + before issues with mod_fastcgi and ruby-fcgi had been pinpointed. + . wasCash can be set to 0 to disable cache cleaning, use `gorg --clean-cache` to clean up + . xsl.c includes trivial patch that was applied to the -0.4.ebuild for 64-bit systems + . ebuild now knows apache and fastcgi USE flags to depend on apache and fastcgi (mod-fcgid) + +====================================================== + +2005-07-04 gorg-0.6 initiated + +2005-07-04 . Pass parameter with path name of requested file (e.g. $link='/doc/en/index.xml') + . Fix segfault when using a string stylesheet instead of a file + (undocumented free in libxslt led to a double free) + . Issue http header with text/html, text/xml or application/xhtml+xml as content-type, + replace application/xhtml+xml with text/html if browser does not accept it + . Stand-alone web server (ie. webrick) fails (500) like the (f)cgi versions when lib(xslt|xml2) return a warning + It's just too confusing to have pages that work with "gorg &" generate errors once live with apache or lighttpd + . Make stand-alone web server use index automatically when requesting a dir and index.xml exists + . Add mount points for webrick (eg. for /cgi-bin or /images) to be handled by stock FileHandler (ie. not gorg) + . xsl extension returns messages output with xsl:message that begin with '%%GORG%%' + . Make xsl extension return requested remote file names (ftp:// & http://) and let + caller decide how to handle caching of those resources + . 
gorg has decided not to cache objects that request foreign URIs + . Allow cookies to be passed to transforms and transforms to set cookies with + xsl:message '%%GORG%%Set-Cookie(cookiename)key=value' + eg. <xsl:message> + <xsl:value-of select="concat('%%GORG%%Set-Cookie(prefs)SL=',$searchLang)"/> + </xsl:message> + NOTE: Setting several cookies will not work properly in some cases with the stand-alone web server + http://rubyforge.org/tracker/?func=detail&aid=2199&group_id=426&atid=1698 + *Update* My patch has been accepted upstream and should make it into an upcoming ruby release + . Allow http redirect (301) with xsl:message '%%GORG%%Redirect=newURI' + . Add charset= to http header for xml files that have encoding="bleh" + . Deprecate zipCache param in favour of zipLevel (0-9) and add support for mod_gzip + zipLevel makes gorg compress cached data *and* returned data if client requests it (accept_encoding=gzip) + Decompressing compressed cached data to have it recompressed on output by apache with mod_deflate is a waste of CPU + Besides, gorg can now be used with web servers that do not support deflating dynamic data (eg. lighttpd) + In short, 0 == gorg does not compress data, neither in its cache nor its output + Of course, you may use mod_deflate or similar to make your web server support gzip encoding + 1-9 == Compress data in cache and return zipped data if client supports it + Make sure your web server does not waste time compressing the data again + (apache looks clever enough and does not recompress as far as I know) + . Pass host value from HTTP header as httphost param to the stylesheet + . Return '<missing file="{filename}"/>' for missing files, no more + Error 500, can be tested by your xsl or just ignored + . Add search engine (uses mysql full text search) + . Documents created with exsl:document under / are created inside + the web site document root. 
Please note that the current directory + is undefined and that docs created without a leading '/' in their + path land wherever libxml2 decides to write them. + +====================================================== + +2006-05-11 gorg-0.6.1 Maintenance release + . chmod a+x search.cgi + . Fix xsl.c to stat() file and not prepend $DocRoot when file + exists. Apache expands ~username/file.xml to e.g. + /home/username/public_html/file.xml and it should be searched + under htdocs/... + +====================================================== + +2006-06-19 gorg-0.6.2 Maintenance release + . Fix xsl.c to prevent ruby exception on empty bodies + +====================================================== + +2006-09-15 gorg-0.6.3 Maintenance release + . Fix syntax error when looking for searched text and no text is + found. Why MySQL matches those is not clear to me yet. + . Don't bail out when cache dir is unusable. Just warn about it. + . Force httphost to configured value even when no host is passed + +====================================================== + +2009-10-17 gorg-0.6.4 Maintenance release + . Fix Gentoo bug #289391 : missing file triggers cache miss + . Fix bug with stand-alone webserver that returns empty content + when cache store fails, e.g. no cache dir or not writable + . Add Listen parameter to make stand-alone webserver listen on + given address, 127.0.0.1 is the default value @@ -0,0 +1,144 @@ +You can find the latest original documentation at +http://gentoo.neysx.org/mystuff/gorg/gorg.xml + + + +How to install Gorg + +Xavier Neys Author + +1. Introduction + +Gorg allows you to serve your own local copy of http://www.gentoo.org. It can +use either a cgi or a fastcgi script with apache, or even use its own +stand-alone web server. Its name is short for Gentoo.org. + +Of course, it can also be used in any other environment to apply XSL to some XML. 
+ +Gorg has been tested with the following packages: + +Code Listing 1.1: Test environment + +[ebuild R ] net-www/apache-2.0.53 +[ebuild R ] net-www/mod_fcgid-1.05 +[ebuild R ] dev-lang/ruby-1.8.2 +[ebuild R ] dev-libs/fcgi-2.4.0 +[ebuild R ] dev-ruby/ruby-fcgi-0.8.5-r1 +[ebuild R ] dev-libs/libxml2-2.6.18 +[ebuild R ] dev-libs/libxslt-1.1.13 + +Note: At the time of writing, mod_fcgid was not in Portage yet. Please see +http://bugs.gentoo.org/show_bug.cgi?id=79313 for an ebuild + +2. Installing Gorg + +Download the gorg ebuild and drop it into your Portage overlay. + +Compute its digest with ebuild gorg-0.5.ebuild digest. This command will +download the tarball automatically. Finally, run emerge gorg. + +Gorg uses the apache2 and fastcgi USE flags. + +3. Configuring Gorg + +Configuring apache + +You may skip this section if you are not going to use apache at all. + +If you want to use fastcgi, which you should anyway, you'll need to add -D +FCGID to the APACHE2_OPTS variable in /etc/conf.d/apache2. + +Then, integrate the apache configuration directives from the provided sample +vhost configuration file /etc/gorg/vhost.sample into your own apache configuration. +Comments in the sample config file will guide you. + +Configuring Gorg + +Create a copy of the sample config file /etc/gorg/gorg.conf.sample named /etc/ +gorg/gorg.conf and edit it. Comments will help you define your own parameters. + +If you do not want to use the default /etc/gorg/gorg.conf config file, you'll +need to define an environment variable called GORG_CONF that points to the +config file. + +Getting the missing files + +Assuming you'll serve your local copy of CVS, or a copy of it, or symlinks to it, +you need to download some files from the dyn directory. 
+ +Code Listing 3.1: Get the missing files + +# cd to your htdocs directory +$ cd /path/to/your/document/root +/htdocs $ mkdir -p dyn/icons +/htdocs $ cd dyn +/htdocs $ wget -O news-index.xml http://www.gentoo.org/dyn/news-index.xml?passthru=1 +# If you also want the icon pages +/htdocs $ wget -O icons.xml http://www.gentoo.org/dyn/icons.xml?passthru=1 +/htdocs $ for FFF in `grep '\.xml"' icons.xml|sed 's:.*\([A-Z2]\{3\}\.xml\).*:\1:'`;do wget -O icons/$FFF http://www.gentoo.org/dyn/icons/$FFF?passthru=1;done +/htdocs $ cd .. +# If you ever need other data from the dyn pages, +# I am sure you get the picture on how to do it. + + +You also need to make the pictures available to your browser. The images +directory is one level above htdocs. Just define a symlink to it and you're +set. + +Code Listing 3.2: Make a symlink to the pictures + +/htdocs $ ln -si ../images images +# It should look like this: +/htdocs $ ls -l +drwxr-xr-x 3 neysx users 128 Sep 14 17:45 css +drwxr-xr-x 31 neysx users 744 Oct 26 00:03 doc +drwxr-xr-x 3 neysx users 544 Nov 2 16:53 dtd +drwxr-xr-x 3 neysx users 168 Nov 3 16:24 dyn +-rw-r--r-- 1 neysx users 1406 Jun 7 2003 favicon.ico +lrwxrwxrwx 1 neysx users 10 Oct 21 22:29 images -> ../images/ +-rw-r--r-- 1 neysx users 190 Nov 9 2002 index.xml +drwxr-xr-x 16 neysx users 384 Apr 1 2004 main +drwxr-xr-x 17 neysx users 6960 Nov 3 15:34 news +drwxr-xr-x 8 neysx users 192 Oct 23 14:52 proj +drwxr-xr-x 4 neysx users 96 Sep 17 14:05 security +drwxr-xr-x 3 neysx users 736 Nov 2 16:40 xsl + +# Your local CVS probably shows a few more entries, but at least +# those mentioned above should be available and kept up-to-date. +# Also remember to keep your images directory current. + + +4. Running Gorg + +The stand-alone web server + +The easiest way to try it out is to run gorg. 
It should display something like: + +Code Listing 4.1: Run Gorg + +$ gorg + + +Starting the Gorg web server on port 8008 + +Hit Ctrl-C or type "kill 31479" to stop it + + +Point your browser to http://localhost:8008 and you should see your favourite +site. + +With apache + +Restart apache (/etc/init.d/apache2 restart) and visit http://localhost +assuming you're installing on your own workstation. + +If it doesn't work, try the stand-alone web server (type gorg). If this doesn't +work either, check your /etc/gorg/gorg.conf config file. If it does work, +please check your apache config files and your logs. Also check that the cache directory +defined in your gorg config file exists and is writable to your apache user. +If all fails, contact me. + + + +The contents of this document are licensed under the Creative Commons - +Attribution / Share Alike license. diff --git a/bin/gorg b/bin/gorg new file mode 100755 index 0000000..4171338 --- /dev/null +++ b/bin/gorg @@ -0,0 +1,100 @@ +#! /usr/bin/ruby + +### Copyright 2004, Xavier Neys (neysx@gentoo.org) +# # +# # This file is part of gorg. +# # +# # gorg is free software; you can redistribute it and/or modify +# # it under the terms of the GNU General Public License as published by +# # the Free Software Foundation; either version 2 of the License, or +# # (at your option) any later version. +# # +# # gorg is distributed in the hope that it will be useful, +# # but WITHOUT ANY WARRANTY; without even the implied warranty of +# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# # GNU General Public License for more details. 
+# # +# # You should have received a copy of the GNU General Public License +# # along with Foobar; if not, write to the Free Software +### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +# Nothing much to do at the moment +# Just start the web server using webrick + +# Some cmd line options will be added when more features are added + +# If you are looking for the cgi, it is called gorg.cgi +# The fcgi version is surprisingly called gorg.fcgi +# Just copy it to your cgi-bin directory (or fcgi-bin) and +# set up apache to use it against .xml files + +require 'gorg/base' + +include Gorg +gorgInit + + +def usage + puts <<EOS + +gorg without any option will either start the web server or +behave like a filter if data is piped into it. + +Available options: + +-C, --clean-cache : clean up the whole web cache +-W, --web : explicitely start the web server +-F, --filter : read xml on stdin, process and write result to stdout + NB: relative paths in xml are from current directory + absolute paths are from {root} in config file +-v, --version : display gorg version number +-I, --index : scan and index xml files +--param N V : parameter name and value to be passed to the XSL processor + It can appear more than once + e.g. 
gorg<file.xml --param N1 V1 --param N2 V2 +EOS +end + + +# Parse cmd-line options + +# Let's do it the easy way until we have more options +if ARGV.length == 1 and ['-W', '--web'].include?(ARGV[0]) then + # Explicit web server requested, do not bother about STDIN + require 'gorg/www' + www +elsif ARGV.length == 1 and ['-C', '--clean-cache'].include?(ARGV[0]) then + # Cache clean up requested, do not bother about STDIN + Cache.washCache($Config["cacheDir"], tmout=900, cleanTree=true) +elsif ARGV.length == 1 and ['-I', '--index'].include?(ARGV[0]) then + require 'gorg/search' + # Index xml files, do not bother about STDIN + gs = GDig::GSearch.new + gs.cleanup # Remove old files + gs.indexDir # Scan for new/modified files +elsif ARGV.include?('-F') or ARGV.include?('--filter') or not STDIN.tty? + # Be a filter by default when data is piped to gorg + # or when -F, --filter is used + + # Scan command line for sequences of '--param paramName paramValue' + params = scanParams(ARGV) + # Only -F or --filter should remain in ARGV + # or nothing at all when piped data is available + if (ARGV.length == 1 and ['-F', '--filter'].include?(ARGV[0])) or (ARGV.length == 0 and not STDIN.tty?) then + require 'gorg/cgi' + do_Filter(300, params) # timeout=5 minutes, default is less + else + usage + end +elsif ARGV.length == 0 and STDIN.tty? 
+ require 'gorg/www' + # No argument & no piped data: run the web server + www +elsif ARGV.length > 1 + usage +elsif ARGV[0] == "-v" or ARGV[0] == "--version" + puts("Gorg version #{Version}") +else + usage +end diff --git a/data/gorg/schema.sql b/data/gorg/schema.sql new file mode 100644 index 0000000..3398c1f --- /dev/null +++ b/data/gorg/schema.sql @@ -0,0 +1,19 @@ +drop table if exists files; +create table files( + id int auto_increment primary key, + path varchar(255) unique, + lang varchar(5), + timestamp varchar(32), + size bigint, + txt mediumtext) CHARACTER SET utf8; +create unique index files_path on files (path(255)); +create index files_lang on files (lang); +create fulltext index files_txt on files (txt); + +drop table if exists savedsearches; +create table savedsearches( + words tinytext, + bool char(1), + lang varchar(5), + result mediumblob); +create index savedsearches_words on savedsearches(lang, words(200)); diff --git a/etc/gorg/gorg.conf.sample b/etc/gorg/gorg.conf.sample new file mode 100644 index 0000000..c3fda72 --- /dev/null +++ b/etc/gorg/gorg.conf.sample @@ -0,0 +1,149 @@ +# /etc/gorg/gorg.conf: Configuration file for Gorg + +# Root dir, typically, your DocumentRoot +# (f)cgi scripts find it in their environment but +# the stand-alone webserver and the search engine need it +root = "/home/neysx/gentoo.org/gentoo/xml/htdocs" + +# Make webrick listen on given IP (IP only, no host name) +listen = 127.0.0.1 + +# Mount paths that are not under the root directory (used by stand-alone web server only) +# eg. to mount /cgi-bin which is usually not under the document root +# Note: Those directories will be handled by the stock FileHandler, ie. 
not by gorg +mount = /cgi-bin on /home/neysx/gentoo.org/gentoo/xml/cgi-bin +mount = /images on /home/neysx/gentoo.org/gentoo/xml/images + +# Should gorg accept cookies and pass $param=$value to the xsl transform +# Default is no (anything but 1 is no) +acceptCookies = 1 + +# Only read so many lines in xml files to identify stylesheets, use 0 to scan whole file +headXSL = 12 + +# Default stylesheet, relative to root dir +defaultXSL = "/xsl/guide.xsl" + +# Only used by fastCGI, auto exit after given number of requests (0 means no) +# The fcgi process manager will restart a new instance automatically +# NB: it will NOT exit before at least 1 full minute has elapsed even if you set a very low value +# If you want a really short-lived version, use the cgi instead +# mod_fcgid does its own process recycling and this feature will be obsoleted in a later version +autoKill = 5000 + +# Allow return of unprocessed xml file if passthru==(anything but 0) appears in URI params +# 0==No, anything else==Yes +passthru = 1 + +# Pass pathname of requested file in named parameter to the xsl transform +# [a-zA-Z]+ , anything else is ignored and no param is passed +# Default is "link" +linkParam = link + +# Pass a param named httphost to the style sheet (== host value from HTTP header) +# 0 or nothing (default) disables this feature +# * will pass the value as received from the user agent or none (http/1.0) +# name alias1 alias2... will pass name when the value sent by the user agent +# matches exactly any of name alias1 alias2... +# if any alias is *, any value (even nil) will match and name will be passed +# When no value matches, the value received from the user agent is passed +#httphost = mysite www.mysite.org mysite.org alias.mysite.org + +# Cache directory. Directory must exist and be writable by whoever runs the server (e.g. apache) +# It must also be writable by the user who runs the stand-alone web server, i.e. not the apache user +# if you want to use both web servers. 
You can even run both at the same time. +# Default is no cache +cacheDir = "/var/cache/gorg" + +# Number of seconds after which a document is considered too old, 0=never +# Can be used to force a refresh or to stress-test the system +#cacheTTL = 86400 # 1 day +cacheTTL = 864000 # or 10 days +#cacheTTL = 600 # or 10 minutes.... + +# Use a tree of directories under cacheDir that matches the site tree +# Use when your system has problems coping with a huge single cache dir +# 0 means no tree (all files in cacheDir) and is the default +# If you use this, make sure you clean up the cache with gorg -C regularly +cacheTree = 1 + +# Max size of cache in megabytes +# Please note that cacheSize is used ONLY when cleaning up either +# when cacheTree==0 and a clean-up is started based on cacheWash (see below) +# or when cacheTree!=0 and `gorg -C` is run +cacheSize = 250 + +# Max number of files in a given cache directory +# Please note that this limit is also enforced when cacheTree == 0 +# in which case it means the max total number of files in the whole cache +maxFiles = 2000 + +# Support gzip http encoding (ie. mod_deflate) +# 0 means no compression *and* no support for gzip encoding. +# 1-9 gives compression level, 1 least compressed, 9 max compressed +# Cached pages use the same compression level +# Default is 2 +zipLevel = 2 + +# Clean cache automatically and regularly when a store into the cache occurs. +# gorg cleans up if random(value) < 10, i.e. 
+# Set to 0 to disable and rely on gorg --clean-cache being run regularly +# a value<=10 means at every call, +# 100 means 10 percent of stores will also clean the cache +# 1000 means 10 permille (cacheSize will be checked only once every 100 stores) +# Note: gorg only tries to clean the dir it caches to, not the whole cache tree +# Use `gorg -C` or `gorg --clean-cache` to clean up whole cache +cacheWash = 0 + +# Level of logging that goes to syslog +# OFF, FATAL, ERROR, WARN, INFO, DEBUG = 0, 1, 2, 3, 4, 5 +logLevel = 4 + +# +# Used only by stand-alone webserver +# + +# Send hit stats to syslog/stderr/a file +accessLog = "syslog" + +# Listen on port (must be >1023 to be run by non-root) +port = 8008 + +# +# Search engine parameters +# + +# Connect string, only mysql is supported at the moment +dbConnect = DBI:mysql:DB_NAME:HOST_NAME +dbUser = USENAME +dbPassword = PASSWORD + +# Document language can be guessed from the document itself with +# an XPath expression. It should return the language code. +# Only the first 5 characters will be used. +# For instance, to use a root element's lang attribute: +xpath_to_lang = /*[1]/@lang + +# If no XPath is given or no lang is found, you can use the file path as a fallback: +# define a regexp to apply to the file path, $1 must yield the language +# For instance, the following one applied to '/doc/en/file.xml' returns 'en' +fpath_to_lang = ^/[^/]+/([^/]+)/.*xml$ + +# include/exclude directives will be processed in the order they appear below. +# First match will be used to either include or exclude the file. +# If no match is found, file is skipped +# Each directive should define one and only one regexp +# Beware, regexp are not shell globs, .xml means any character followed by xml anywhere in the file name +# .+\.xml$ means one or more characters followed by a dot and ending with xml +# Any file that can't be processed, ie. 
because it is not well-formed will not be indexed + +exclude = ^/proj/en/gdp/tests/ +exclude = /CVS/ +exclude = ^/xsl/ +exclude = /draft/ +exclude = ^/doc/.+/handbook/2004 +exclude = metadoc\.xml$ +exclude = /inserts-.+\.xml$ +exclude = ^/dyn/ +exclude = herds/pkgList.xml +include = ^/.+\.xml$ diff --git a/etc/gorg/lighttpd.conf.sample b/etc/gorg/lighttpd.conf.sample new file mode 100644 index 0000000..ad0d932 --- /dev/null +++ b/etc/gorg/lighttpd.conf.sample @@ -0,0 +1,118 @@ +# Example lighttpd.conf for Gorg + +var.basedir = "/var/www/www.gentoo.org" +var.logdir = "/var/log/lighttpd" +var.statedir = "/var/lib/lighttpd" +var.cache = "/var/cache/gorg" + +server.modules = ( +# "mod_rewrite", +# "mod_redirect", + "mod_alias", + "mod_access", +# "mod_status", +# "mod_setenv", +# "mod_userdir", + "mod_compress", + "mod_accesslog" +) + +include "mime-types.conf" + +server.username = "lighttpd" +server.groupname = "lighttpd" + +server.document-root = var.basedir + "/htdocs" +server.pid-file = "/var/run/lighttpd.pid" + +server.errorlog = var.logdir + "/error.log" +accesslog.filename = var.logdir + "/access.log" +accesslog.format = "%h %t \"%r\" %b %s %T \"%{User-Agent}i\" \"%{Referer}i\" %I %O" + +server.indexfiles = ("index.xml", "index.html" ) + +server.tag = "Lightppd 1.4.8/Gentoo 2006.0" + +server.follow-symlink = "enable" + +server.port = 80 + +server.errorfile-prefix = var.basedir + "/errors/" + +static-file.exclude-extensions = (".cgi", ".fcgi") + +dir-listing.activate = "disable" +dir-listing.hide-dotfiles = "enable" +dir-listing.exclude = ("^\.", "~$", "\.\.") + +url.access-deny = ("~", ".inc") + +# userdir.path = "public_html" +# userdir.exclude-user = ("root") + +# ssl.engine = "enable" +# ssl.pemfile = "server.pem" + +# status.status-url = "/server-status" +# status.config-url = "/server-config" + +compress.cache-dir = var.cache +compress.filetype = ("text/plain", "text/html", "text/css") + +$HTTP["url"] =~ "^/xsl/" { + url.access-deny = ( ".xsl" ) +} + 
+$HTTP["url"] =~ "^/dyn/doc-snapshots/" { + dir-listing.activate = "enable" +} + +# url.rewrite = ( +# "^/$" => "/server-status" +# ) + +# url.redirect = ( +# "^/wishlist/(.+)" => "http://www.123.org/$1" +# ) + +server.modules += ("mod_cgi") +alias.url = ( "/cgi-bin/" => var.basedir + "/cgi-bin/" +) + +$HTTP["url"] =~ "^/cgi-bin/" { + # disable directory listings + dir-listing.activate = "disable" + # only allow cgi's in this directory + cgi.assign = ( ".cgi" => "" ) +} + +server.modules += ("mod_fastcgi") +fastcgi.debug = 0 +fastcgi.server = ( + ".xml" => + ( "localhost" => + ( + "host" => "127.0.0.1", + "port" => 1026, + "bin-path" => "/var/www/www.gentoo.org/fcgi-bin/gorg.fcgi", + "idle-timeout" => 60, + "broken-scriptfilename" => "enable", + "bin-environment" => ( "GORG_CONF" => "/etc/gorg/gorg.conf" ), + "max-procs" => 5, + "min-procs" => 1 + ) + ), + ".rss" => + ( "localhost" => + ( + "host" => "127.0.0.1", + "port" => 1026, + "bin-path" => "/var/www/www.gentoo.org/fcgi-bin/gorg.fcgi", + "idle-timeout" => 60, + "broken-scriptfilename" => "enable", + "bin-environment" => ( "GORG_CONF" => "/etc/gorg/gorg.conf" ), + "max-procs" => 5, + "min-procs" => 1 + ) + ) + ) diff --git a/etc/gorg/vhost.sample b/etc/gorg/vhost.sample new file mode 100644 index 0000000..2dce594 --- /dev/null +++ b/etc/gorg/vhost.sample @@ -0,0 +1,152 @@ +<VirtualHost www.mygentoo.org> +ServerName www.mygentoo.org +ServerAlias mygentoo +DocumentRoot /var/www/www.mygentoo.org/htdocs +ServerAdmin webman@mygentoo.org +LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\"" common +CustomLog "| /usr/sbin/cronolog /var/log/apache/www.mygentoo.org/%Y%m%d.%Z.www.mygentoo.org.access-log" common +ErrorLog "| /usr/sbin/cronolog /var/log/apache/www.mygentoo.org/%Y%m%d.%Z.www.mygentoo.org.error-log" +AddDefaultCharset utf-8 + +<Directory /var/www/www.mygentoo.org/htdocs> + DirectoryIndex index.xml + Options +Includes -Indexes + AllowOverride All + Order Allow,Deny + Allow from all + + # 
cgi mode + #Action gorg /cgi-bin/gorg.cgi + + # fast cgi mode + Action gorg /fcgi-bin/gorg.fcgi + + # process .xml and .rdf files through gorg + AddHandler gorg .xml + AddHandler gorg .rdf +</Directory> + +ScriptAlias /cgi-bin/ /var/www/www.mygentoo.org/cgi-bin/ +Alias /fcgi-bin/ /var/www/www.mygentoo.org/fcgi-bin/ + +# handle images and icons +Alias /images /var/www/www.mygentoo.org/images +Alias /icons /var/www/www.mygentoo.org/icons + +<Directory /var/www/www.mygentoo.org/images> + AllowOverride None + Order allow,deny + Allow from all +</Directory> + +<Directory /var/www/www.mygentoo.org/icons> + AllowOverride None + Order allow,deny + Allow from all +</Directory> + +<Directory /var/www/www.mygentoo.org/fcgi-bin> + AllowOverride All + Options ExecCGI + Order allow,deny + Allow from all + # You should use mod_fcgid, but just in case you insist on using mod_fastcgi: + #<IfModule mod_fastcgi.c> + # SetHandler fastcgi-script + #</IfModule> + <IfModule mod_fcgid.c> + SetHandler fcgid-script + </IfModule> +</Directory> + +<Directory /var/www/www.mygentoo.org/cgi-bin> + AllowOverride All + Options ExecCGI + Order allow,deny + Allow from all +</Directory> + +</VirtualHost> + + +# You should use mod_fcgid, but just in case you insist on using mod_fastcgi: +#<IfModule mod_fastcgi.c> +# FastCgiServer /var/www/gorg/fcgi-bin/gorg.fcgi -processes 3 -restart-delay 1 -idle-timeout 300 -appConnTimeout 240 +#</IfModule> + +<IfModule mod_fcgid.c> + IdleTimeout 120 + #IdleTimeout n (300 seconds) + #An idle fastcgi application will be terminated after IdleTimeout seconds. + + IdleScanInterval 30 + #IdleScanInterval n (120 seconds) + #The scan interval for idle fastcgi applications. + + BusyTimeout 360 + #BusyTimeout n (300 seconds) + #a fastcgi application will be terminated if handing a single request longer than busy timeout. + + BusyScanInterval 60 + #BusyScanInterval n (120 seconds) + #The scan interval for busy timeout fastcgi applications. 
+ + ErrorScanInterval 5 + #ErrorScanInterval n (3 seconds) + #The scan interval for exit pending fastcgi applications. + #fastcgi applications will be terminated within this scanning. + + ZombieScanInterval 5 + #ZombieScanInterval n (3 seconds) + #The scan interval for zombie process. + + ProcessLifeTime 3000 + #ProcessLifeTime n (3600 seconds) + #A fastcgi application will be terminated if lifetime expired, even no error is detected. + + SocketPath /var/run + #SocketPath path (logs/fcgidsock) + #The directory to put the UNIX domain socket. (UNIX only) + + SpawnScoreUpLimit 24 + #SpawnScoreUpLimit n (10) + + #The spawn-speed control score up water limit. + #Score increases while a process is spawned or terminated, and decreases as time progresses; + #while the score is higher than SpawnScoreUpLimit, the spawning will be held for a while. + #The higher this number is, the higher speed of the spawning can be. + + SpawnScore 3 + #SpawnScore n (1) + #The weight of spawning. This weight will be plused to the spawn-control score on every spawn. + #The higher this number is, the lower speed of spawning can be. + + TerminationScore 1 + #TerminationScore n (2) + #The weight of termination. This weight will be plused to the score while fastcgi process terminates. + #The higher this number is, the lower speed of spawning can be. + + MaxProcessCount 16 + #MaxProcessCount n (1000) + #The max count of total fastcgi process count. + + DefaultMaxClassProcessCount 8 + #DefaultMaxClassProcessCount n (100) + #The maximum number of fastcgi application instances + #allowed to run for any one fastcgi application. + + DefaultInitEnv GORG_CONF /etc/gorg/gorg.conf + #The default environment variables before a fastcgi application is spawned. You can set this configuration more than once. + + IPCConnectTimeout 10 + #IPCConnectTimeout n (2 seconds) + #The connect timeout to a fastcgi application. 
+ + IPCCommTimeout 90 + #IPCCommTimeout n (5 seconds) + #The communication timeout to a fastcgi application. + + OutputBufferSize 0 + #OutputBufferSize n (64k bytes) + #CGI output cache buffer size. +</IfModule> diff --git a/ext/gorg/xsl/MANIFEST b/ext/gorg/xsl/MANIFEST new file mode 100644 index 0000000..29fb55d --- /dev/null +++ b/ext/gorg/xsl/MANIFEST @@ -0,0 +1,3 @@ +extconf.rb +xsl.c +xsl.h diff --git a/ext/gorg/xsl/extconf.rb b/ext/gorg/xsl/extconf.rb new file mode 100644 index 0000000..1bd115b --- /dev/null +++ b/ext/gorg/xsl/extconf.rb @@ -0,0 +1,22 @@ +require "mkmf"
+
+unless have_library("xml2", "xmlRegisterDefaultInputCallbacks")
+ puts("libxml2 not found")
+ exit(1)
+end
+
+unless have_library('xslt','xsltParseStylesheetFile')
+ puts("libxslt not found")
+ exit(1)
+end
+
+unless have_library('exslt','exsltRegisterAll')
+ puts("libexslt not found")
+ exit(1)
+end
+
+$LDFLAGS << ' ' << `xslt-config --libs`.chomp
+
+$CFLAGS << ' ' << `xslt-config --cflags`.chomp
+
+create_makefile("gorg/xsl")
diff --git a/ext/gorg/xsl/xsl.c b/ext/gorg/xsl/xsl.c new file mode 100644 index 0000000..d8d40b6 --- /dev/null +++ b/ext/gorg/xsl/xsl.c @@ -0,0 +1,894 @@ +/* + Copyright 2004, Xavier Neys (neysx@gentoo.org) + + This file is part of gorg. + + gorg is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + gorg is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with gorg; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include "xsl.h" + +/* + * Copied from xmlIO.c from libxml2 + */ +static int xmlFileWrite (void * context, const char * buffer, int len) +{ + int items; + + if ((context == NULL) || (buffer == NULL)) + return(-1); + items = fwrite(&buffer[0], len, 1, (FILE *) context); + if ((items == 0) && (ferror((FILE *) context))) { + //xmlIOErr(0, "fwrite()"); + __xmlIOErr(XML_FROM_IO, 0, "fwrite() failed"); + return(-1); + } + return(items * len); +} + +extern int xmlLoadExtDtdDefaultValue; +static int xmlOptions = XSLT_PARSE_OPTIONS | XML_PARSE_NOWARNING; + +/*Enum xmlParserOption { + XML_PARSE_RECOVER = 1 : recover on errors + XML_PARSE_NOENT = 2 : substitute entities + XML_PARSE_DTDLOAD = 4 : load the external subset + XML_PARSE_DTDATTR = 8 : default DTD attributes + XML_PARSE_DTDVALID = 16 : validate with the DTD + XML_PARSE_NOERROR = 32 : suppress error reports + XML_PARSE_NOWARNING = 64 : suppress warning reports + XML_PARSE_PEDANTIC = 128 : pedantic error reporting + XML_PARSE_NOBLANKS = 256 : remove blank nodes + XML_PARSE_SAX1 = 512 : use the SAX1 interface 
internally + XML_PARSE_XINCLUDE = 1024 : Implement XInclude substitution + XML_PARSE_NONET = 2048 : Forbid network access + XML_PARSE_NODICT = 4096 : Do not reuse the context dictionary + XML_PARSE_NSCLEAN = 8192 : remove redundant namespaces declarations + XML_PARSE_NOCDATA = 16384 : merge CDATA as text nodes + XML_PARSE_NOXINCNODE = 32768 : do not generate XINCLUDE START/END nodes +}*/ + +/* + * Library global values that need to be accessed by the callbacks + * Make sure the lib init routine registers them with ruby's GC + */ +VALUE g_xroot=Qnil; +VALUE g_xfiles=Qnil; +VALUE g_xmsg=Qnil; +VALUE g_mutex=Qnil; +VALUE g_xtrack=Qnil; // true/false, no need to register this one + +/* + * Store ID's of ruby methods to speed up calls to rb_funcall* + * so that we do not have to call rb_intern("methodName") repeatedly. + */ +struct { + int include; + int to_a; + int to_s; + int length; + int synchronize; +} id; + + +/* + * Add file to list of requested files, if not already in our array + */ +void addTrackedFile(char *f, const char *rw) +{ + VALUE rbNewPath; + VALUE rwo; + VALUE rbNewEntry; + + if (Qtrue == g_xtrack) + { + switch(*rw) + { + case 'R': + case 'r': + rwo = rb_str_new2("r"); + break; + case 'W': + case 'w': + rwo = rb_str_new2("w"); + break; + default: + rwo = rb_str_new2("o"); + } + rbNewPath = rb_str_new2(f); + rbNewEntry = rb_ary_new(); + rb_ary_push(rbNewEntry, rwo); + rb_ary_push(rbNewEntry, rbNewPath); + if (Qtrue != rb_funcall(g_xfiles, id.include, 1, rbNewEntry)) + rb_ary_push(g_xfiles, rbNewEntry); + } +} + +/* + * libxml2 File I/O Match Callback : + * return 1 if we must handle the file ourselves + */ +int XRootMatch(const char * URI) { + int r = 0; +//printf("NSX-RootMatch: %s\n",URI); + if ( URI != NULL && (*URI == '/' || !strncmp(URI, "file:///", 8))) + r = 1; + else + if (!strncmp(URI, "ftp://", 6) || !strncmp(URI, "http://", 7)) + // Add URI to list of requested files to let caller know remote files are used + addTrackedFile((char *)URI, 
"o"); + + return r; +} + + +/* + * libxml2 File I/O Open Callback : + * open the file, prepend $xroot if necessary and add file to list of requested files on input + */ +void *XRootOpen (const char *filename, const char* rw) { + char *path = NULL; + char *fakexml = NULL; + FILE *fd; + char *rbxrootPtr=""; + int rbxrootLen=0; + char empty[] = "<?xml version='1.0'?><missing file='%s'/>"; + int pip[2]; + struct stat notused; + +//printf("NSX-RootOpen: %s\n", filename); + + if (filename == NULL || (*filename != '/' && strncmp(filename, "file:///", 8))){ + return NULL; // I told you before, I can't help you with that file ;-) + } + + if (g_xroot != Qnil) + { + rbxrootPtr = RSTRING(g_xroot)->ptr; + rbxrootLen = RSTRING(g_xroot)->len; + } + path = (char *) malloc((strlen(filename) + rbxrootLen + 1) * sizeof(char)); + if (path == NULL) + return NULL; + + if (!strncmp(filename, "file:///", 8)) + { + // Absolute path, do not prepend xroot, e.g. file:///etc/xml/catalog + strcpy ( path, filename+7); + } + else + { + // If requested file is already under xroot, do not prepend path with xroot + // Example: + // Say we have xroot="/htdocs" + // when calling document('../xml/file.xml') in /htdocs/xsl/mysheet.xsl, + // the lib will already have replaced the .. with /htdocs + // and there is no need to add /htdocs + // On the other hand, if we call document('/xml/file.xml') in /htdocs/xsl/mysheet.xsl, + // because we know our root is /htdocs, then we need to prepend xroot to get /htdocs/xml/file.xml + // The consequence of that is that /${DocRoot}/${DocRoot}/whatever is not usable. Get over it. + // + // Besides, it is also possible that a file is located outside the $DocumentRoot, e.g. ~usename/file.xml + // that apache would have expanded to /home/username/public_html/file.xml e.g. 
+ if (rbxrootLen && strncmp(rbxrootPtr, filename, rbxrootLen) && stat(filename,&notused)) + { + // Requested file is not already under $DocRoot, prepend it + strcpy (path, rbxrootPtr); + strcat (path, filename); + } + else + { + // Use the filename that was requested as-is + strcpy(path, filename); + } + } + + // Add file to list of requested files + addTrackedFile(path, rw); + + fd = fopen(path, rw); + free(path); + + if (*rw == 'r' && fd == NULL && strncmp(filename, "file:///", 8) && strlen(filename)>4 && strncmp((strlen(filename)-4)+filename, ".dtd", 4) && strncmp((strlen(filename)-4)+filename, ".xsl", 4)) + // Return fake xml + // We don't know for sure that libxml2 wants an xml file from a document(), + // but what the heck, let's just pretend + if (pipe(pip)) + return (void *) NULL; + else + { + fakexml = (char *) malloc((strlen(filename) + sizeof(empty)) * sizeof(char)); + if (path == NULL) + return NULL; + sprintf(fakexml, empty, filename); + write(pip[1], fakexml, strlen(fakexml)); + close(pip[1]); + free(fakexml); + return (void *) fdopen(pip[0], "r"); + } + else + return (void *) fd; +} + +int XRootClose (void * context) { + if (context == (void *) -1) + return 0; + else + return xmlFileClose(context); +} + +void *XRootInputOpen (const char *filename) { + return XRootOpen (filename, "r"); +} + +void *XRootOutputOpen (const char *filename) { + return XRootOpen (filename, "w"); +} + + +/* + * Intercept xsl:message output strings, + * If one starts with "%%GORG%%" then add it to our @xmsg array. + * If not, pass it to the default generic handler of libxslt + */ +void xslMessageHandler(void *ctx ATTRIBUTE_UNUSED, const char *msg, ...) 
+{ + va_list args; + char *str; + int len; + + va_start(args, msg); + len = vasprintf(&str, msg, args); + va_end(args); + + if (len > 0) + { + if (!strncmp(str, "%%GORG%%", 8)) + { + if (len > 8) + { + rb_ary_push(g_xmsg, rb_str_new2(str+8)); + } + } + else + { + // Not for gorg, spit it out on stderr as libxslt would do + fputs(str, stderr); + } + // Need to free pointer that was allocated by vasprintf + free(str); + } +} + + +/* + * Try to distinguish between a filename and some xml + * without accessing the filesystem or parsing the string as xml + * + * If the string is long (>FILENAME_MAX) or + * starts with "<?xml" or "<?xsl" or + * contains newline chars, + * we assume it is some kind of xml, otherwise we assume it is a filename + */ +int looksLikeXML(VALUE v) +{ + return (RSTRING(v)->len > FILENAME_MAX) + || (!strncmp(RSTRING(v)->ptr, "<?xml", 5)) + || (!strncmp(RSTRING(v)->ptr, "<?xsl", 5)) + || (strstr(RSTRING(v)->ptr, "\n")); +// We could also try with " " but some are stupid enough to use spaces in filenames +} + +// I got stumped and needed this ;-) +void dumpCleanup(char * str, struct S_cleanup c) +{ +printf( "%s\n" + "\nparams=%08x" + "\ndocxml=%08x" + "\ndocxsl=%08x" + "\ndocres=%08x" + "\n xsl=%08x" + "\ndocstr=%08x" + "\n=======================\n", str, c.params, c.docxml, c.docxsl, c.docres, c.xsl, c.docstr); +} + +/* + * my_raise : cleanup and raise ruby exception + * + * cleanup frees xsl docs and allocated memory, pointers are in passed struct + * then raises the passed exception + * + * struct of pointers can be NULL (no memory to free) and + * exception can be NULL (clean up only, do not call rb_raise) + * + * Set last error level and last error message if applicable and available + */ +void my_raise(VALUE obj, s_cleanup *clean, VALUE rbExcep, char *err) +{ + xmlErrorPtr xmlErr = NULL; + VALUE hErr; + + if (!NIL_P(obj)) + { + xmlErr = xmlGetLastError(); + hErr = rb_hash_new(); + if (xmlErr) + { + // It seems we usually get a \n at the end of 
the msg, get rid of it + if (*(xmlErr->message+strlen(xmlErr->message)-1) == '\n') + *(xmlErr->message+strlen(xmlErr->message)-1) = '\0'; + // Build hash with error level, code and message + rb_hash_aset(hErr, rb_str_new2("xmlErrCode"), INT2FIX(xmlErr->code)); + rb_hash_aset(hErr, rb_str_new2("xmlErrLevel"), INT2FIX(xmlErr->level)); + rb_hash_aset(hErr, rb_str_new2("xmlErrMsg"), rb_str_new2(xmlErr->message)); + } + else + { + // Build hash with only an error code of 0 + rb_hash_aset(hErr, rb_str_new2("xmlErrCode"), INT2FIX(0)); + rb_hash_aset(hErr, rb_str_new2("xmlErrLevel"), INT2FIX(0)); + } + rb_iv_set(obj, "@xerr", hErr); + } + + if (clean) + { + //dumpCleanup("Freeing pointers", *clean); + free(clean->params); + xmlFree(clean->docstr); + xmlFreeDoc(clean->docres); + xmlFreeDoc(clean->docxml); + //xmlFreeDoc(clean->docxsl); segfault /\/ Veillard said xsltFreeStylesheet(xsl) does it + xsltFreeStylesheet(clean->xsl); + } + // Clean up xml stuff + xmlCleanupInputCallbacks(); + xmlCleanupOutputCallbacks(); + xmlResetError(xmlErr); + xmlResetLastError(); + xsltCleanupGlobals(); + xmlCleanupParser(); + xsltSetGenericErrorFunc(NULL, NULL); + + // Reset global variables to let ruby's GC do its work + g_xroot = Qnil; + g_xfiles = Qnil; + g_xmsg = Qnil; + + // Raise exception if requested to + if (rbExcep != Qnil) + { + rb_raise(rbExcep, err); + } +} + + +/* + * Register input callback with libxml2 + * + * We need to repeat this call because libxml cleanup unregisters and we like cleaning up + */ +void my_register_xml(void) +{ + // Enable exslt + exsltRegisterAll(); + + // Register default callbacks, e.g.http:// + xmlRegisterDefaultInputCallbacks(); + xmlRegisterDefaultOutputCallbacks(); + +/* NO NEED xmlRegisterInputCallbacks(xmlIOHTTPMatch, xmlIOHTTPOpen, xmlIOHTTPRead, xmlIOHTTPClose); +xmlRegisterInputCallbacks(xmlFileMatch, xmlFileOpen, xmlFileRead, xmlFileClose);*/ + + // Add our own file input callback + if (xmlRegisterInputCallbacks(XRootMatch, XRootInputOpen, 
xmlFileRead, XRootClose) < 0) + { + rb_raise(rb_eSystemCallError, "Failed to register input callbacks"); + } + + // Add our own file output callback to support exslt:document + if (xmlRegisterOutputCallbacks(XRootMatch, XRootOutputOpen, xmlFileWrite, xmlFileClose) < 0) + { + rb_raise(rb_eSystemCallError, "Failed to register output callbacks"); + } + // Add our own xsl:message handler + xsltSetGenericErrorFunc(NULL, xslMessageHandler); + + xsltDebugSetDefaultTrace(XSLT_TRACE_NONE); + xmlSubstituteEntitiesDefault(1); + xmlLoadExtDtdDefaultValue=1; +} + + +/* + * Check that parameters are usable, i.e. like + * [p1, v1] : single parameter + * [[p1, v1], [p2, v2]...] : several pairs of (param name, value) + * {p1=>v1...} : a hash of (param name, value) + * nil : no parameter + * + * Raise an exception if not happy or return the list of params as + * [[p1, v1], [p2, v2]...] + */ +VALUE check_params(VALUE xparams) +{ + VALUE retparams=Qnil; + + if (!NIL_P(xparams)) + { + VALUE ary; + VALUE param; + int len, plen; + int i; + + // Reject some single values straight away + switch (TYPE(xparams)) + { + case T_FLOAT: + case T_REGEXP: + case T_FIXNUM: + case T_BIGNUM: + case T_STRUCT: + case T_FILE: + case T_TRUE: + case T_FALSE: + case T_DATA: + case T_SYMBOL: + rb_raise(rb_eTypeError, "Invalid parameters"); + return Qnil; + } + // if xparams is not an array, try to make one + ary = rb_funcall(xparams, id.to_a, 0); + + // Now check that our array is a suitable array: + // empty array => Qnil + // array.length==2, could be 2 params [[p1,v1],[p2,v2]] or 1 param [p,v] + // if both items are arrays, we have a list of params, otherwise we have a single param + len = RARRAY(ary)->len; + switch (len) + { + case 0: + retparams = Qnil; + break; + case 2: + // fall through to default if we have 2 arrays, otherwise, we must have 2 strings + if (! 
(TYPE(rb_ary_entry(ary,0))==T_ARRAY && TYPE(rb_ary_entry(ary,1))==T_ARRAY)) + { + VALUE s1 = rb_funcall(rb_ary_entry(ary,0), id.to_s, 0); + VALUE s2 = rb_funcall(rb_ary_entry(ary,1), id.to_s, 0); + + // Both items must be strings + retparams = rb_ary_new3(2L, s1, s2); + break; + } + default: + // scan array and check that each item is an array of 2 strings + retparams = rb_ary_new(); + for (i=0; i < len; ++i) + { + if ( TYPE(rb_ary_entry(ary,i)) != T_ARRAY ) + { + rb_raise(rb_eTypeError, "Invalid parameters"); + return Qnil; + } + param = rb_ary_entry(ary,i); + plen = NUM2INT(rb_funcall(param, id.length, 0)); + if ( plen != 2 ) + { + rb_raise(rb_eTypeError, "Invalid parameters"); + return Qnil; + } + VALUE s1 = rb_funcall(rb_ary_entry(param,0), id.to_s, 0); + VALUE s2 = rb_funcall(rb_ary_entry(param,1), id.to_s, 0); + + rb_ary_push(retparams, rb_ary_new3(2L, s1, s2)); + } + } + } + return retparams; +} + + +/* + * Build array of pointers to strings + * + * return NULL or pointer + */ +char *build_params(VALUE rbparams) +{ + char *ret; + char **paramPtr; + char *paramData; + int i; + VALUE tempval; + VALUE tempstr; + char quotingChar; + + if (rbparams == Qnil) + // You shoud not call this if you have no params, see it as an error + return NULL; + + // Compute total block size in one go + tempval = rb_funcall(rbparams, id.to_s, 0); + ret = malloc ( ((RARRAY(rbparams)->len)*2+1) * sizeof(void *) // Two pointers per [param, value] + 1 NULL + + (RARRAY(rbparams)->len) * 4 * sizeof(char) // Quotes around values + 1 NULL per value + + (RSTRING(tempval)->len) * sizeof(char) // Size of param names & values + ); + if ( ret==NULL) + return NULL; // out of memory + + paramPtr = (char **)ret; + paramData = ret + ((RARRAY(rbparams)->len)*2+1) * sizeof(void *); + // Copy each param name & value + for (i=0; i<RARRAY(rbparams)->len; ++i) + { + tempval = rb_ary_entry(rbparams, i); // ith param, i.e. [name, value] + + // 1. 
Add param name + + tempstr = rb_ary_entry(tempval, 0); // param name + // Add param name address to list of pointers + *paramPtr++ = paramData; + // Copy param name into data block + strcpy(paramData, RSTRING(tempstr)->ptr); + // Move data pointer after inserted string + paramData += 1+ RSTRING(tempstr)->len; + + // 2. Copy param value, quoting it with ' or " + + tempstr = rb_ary_entry(tempval, 1); // param value + // Don't bother if param is a mix of ' and ", users should know better :-) + // or it's been checked already. Here we expect params to be OK. + quotingChar = '"'; + if ( strchr(RSTRING(tempstr)->ptr, quotingChar) ) + quotingChar = '\''; // Use ' instead of " + + // Add para value address in list of pointers + *paramPtr++ = paramData; + + // Start with quoting character + *paramData++ = quotingChar; + // Copy value + strcpy(paramData, RSTRING(tempstr)->ptr); + // Move data pointer after inserted string + paramData += RSTRING(tempstr)->len; + // Close quote + *paramData++ = quotingChar; + // End string with \0 + *paramData++ = '\0'; + } + // Terminate list of pointers with a NULL + *paramPtr = NULL; + + return ret; +} + + + + +/* + * Parse stylesheet and xml document, apply stylesheet and return result + */ +VALUE xsl_process_real(VALUE none, VALUE self) +{ + s_cleanup myPointers; + int docstrlen; + + VALUE rbxml, rbxsl, rbout, rbparams, rbxroot; + + // Get instance data in a reliable format + rbxml = rb_iv_get(self, "@xml"); + if (NIL_P(rbxml)) + rb_raise(rb_eArgError, "No XML data"); + rbxml = StringValue(rbxml); + if (!RSTRING(rbxml)->len) + rb_raise(rb_eArgError, "No XML data"); + rbxsl = rb_iv_get(self, "@xsl"); + if (NIL_P(rbxsl)) + rb_raise(rb_eArgError, "No Stylesheet"); + rbxsl = StringValue(rbxsl); + if (!RSTRING(rbxsl)->len) + rb_raise(rb_eArgError, "No Stylesheet"); + rbxroot = rb_iv_get(self, "@xroot"); + rbparams = check_params(rb_iv_get(self, "@xparams")); + + // Initialize our globals + if (!NIL_P(rbxroot)) + g_xroot = StringValue(rbxroot); 
+ g_xtrack = RTEST(rb_iv_get(self, "@xtrack")) ? Qtrue : Qfalse; + g_xfiles = rb_ary_new(); + g_xmsg = rb_ary_new(); + + // Register callbacks and stuff + my_register_xml(); + + // Make sure our pointers are all NULL + memset(&myPointers, '\0', sizeof(myPointers)); + + // Build param array + if (rbparams != Qnil) + if (NULL==(myPointers.params=build_params(rbparams))) + my_raise(self, &myPointers, rb_eNoMemError, "Cannot allocate parameter block"); + + // Parse XSL + if (looksLikeXML(rbxsl)) + { + myPointers.docxsl = xmlParseMemory(RSTRING(rbxsl)->ptr, RSTRING(rbxsl)->len); +// myPointers.docxsl = xmlReadMemory(RSTRING(rbxsl)->ptr, RSTRING(rbxsl)->len, ".", NULL, 0); + if (myPointers.docxsl == NULL) + { + my_raise(self, &myPointers, rb_eSystemCallError, "XSL parsing error"); + return Qnil; + } + myPointers.xsl = xsltParseStylesheetDoc(myPointers.docxsl); + if (myPointers.xsl == NULL) + { + my_raise(self, &myPointers, rb_eSystemCallError, "XSL stylesheet parsing error"); + return Qnil; + } + } + else // xsl is a filename + { + myPointers.xsl = xsltParseStylesheetFile(RSTRING(rbxsl)->ptr); + if (myPointers.xsl == NULL) + { + my_raise(self, &myPointers, rb_eSystemCallError, "XSL file loading error"); + return Qnil; + } + } + + // Parse XML + if (looksLikeXML(rbxml)) + { + myPointers.docxml = xmlReadMemory(RSTRING(rbxml)->ptr, RSTRING(rbxml)->len, ".", NULL, xmlOptions); + if (myPointers.docxml == NULL) + { + my_raise(self, &myPointers, rb_eSystemCallError, "XML parsing error"); + return Qnil; + } + } + else // xml is a filename + { + myPointers.docxml = xmlReadFile(RSTRING(rbxml)->ptr, NULL, xmlOptions); + if (myPointers.docxml == NULL) + { + my_raise(self, &myPointers, rb_eSystemCallError, "XML file parsing error"); + return Qnil; + } + } + + // Apply stylesheet to xml + myPointers.docres = xsltApplyStylesheet(myPointers.xsl, myPointers.docxml, (void*)myPointers.params); + if (myPointers.docres == NULL) + { + my_raise(self, &myPointers, rb_eSystemCallError, 
"Stylesheet apply error"); + return Qnil; + } + + xsltSaveResultToString(&(myPointers.docstr), &docstrlen, myPointers.docres, myPointers.xsl); + if ( docstrlen >= 1 ) + rbout = rb_str_new2((char*)(myPointers.docstr)); + else + rbout = Qnil; + rb_iv_set(self, "@xres", rbout); + rb_iv_set(self, "@xfiles", g_xfiles); + rb_iv_set(self, "@xmsg", g_xmsg); + + // Clean up, no exception to raise + my_raise(self, &myPointers, Qnil, NULL); + return rbout; +} + +// Use g_mutex to make sure our callbacks do not mess up the globals +// if the user is running several transforms in parallel threads +static VALUE in_sync(VALUE self) +{ + return rb_funcall(self, id.synchronize, 0); +} + +VALUE xsl_process(VALUE self) +{ + rb_iterate(in_sync, g_mutex, xsl_process_real, self); +} + +/* + * @xerr + */ +VALUE xsl_xerr_get( VALUE self ) +{ + return rb_iv_get(self, "@xerr"); +} + +/* + * @xres + */ +VALUE xsl_xres_get( VALUE self ) +{ + return rb_iv_get(self, "@xres"); +} + +/* + * @xmsg + */ +VALUE xsl_xmsg_get( VALUE self ) +{ + return rb_iv_get(self, "@xmsg"); +} + +/* + * @xfiles + */ +VALUE xsl_xfiles_get( VALUE self ) +{ + return rb_iv_get(self, "@xfiles"); +} + +/* + * @xparams + */ +VALUE xsl_xparams_set( VALUE self, VALUE xparams ) +{ + // Check params and raise an exception if not happy + check_params(xparams); + // Store parameters + return rb_iv_set(self, "@xparams", xparams); +} + +VALUE xsl_xparams_get( VALUE self ) +{ + return rb_iv_get(self, "@xparams"); +} + +/* + * @xroot + */ +VALUE xsl_xroot_set( VALUE self, VALUE xroot ) +{ + // Throw an exception if xroot cannot be used as a string + if (!NIL_P(xroot)) StringValue(xroot); + // Store param in @xroot + rb_iv_set(self, "@xroot", xroot); + + return xroot; +} + +VALUE xsl_xroot_get( VALUE self ) +{ + return rb_iv_get(self, "@xroot"); +} + +/* + * @xtrack + */ +VALUE xsl_xtrack_set( VALUE self, VALUE xtrack ) +{ + // @xtrack is true if param is neither Qnil nor QFalse + rb_iv_set(self, "@xtrack", RTEST(xtrack) ? 
Qtrue : Qfalse); + + return xtrack; +} + +VALUE xsl_xtrack_get( VALUE self ) +{ + return rb_iv_get(self, "@xtrack"); +} + +/* + * @xml + */ +VALUE xsl_xml_set( VALUE self, VALUE xml ) +{ + // Throw an exception if xml cannot be used as a string + if (!NIL_P(xml)) StringValue(xml); + // Store param in @xml + rb_iv_set(self, "@xml", xml); + + return xml; +} + +VALUE xsl_xml_get( VALUE self ) +{ + return rb_iv_get(self, "@xml"); +} + +/* + * @xsl + */ +VALUE xsl_xsl_set( VALUE self, VALUE xsl ) +{ + // Throw an exception if xsl cannot be used as a string + if (!NIL_P(xsl)) StringValue(xsl); + // Store param in @xsl + rb_iv_set(self, "@xsl", xsl); + + return xsl; +} + +VALUE xsl_xsl_get( VALUE self ) +{ + return rb_iv_get(self, "@xsl"); +} + + +static VALUE xsl_init(VALUE self) +{ + rb_iv_set(self, "@xml", Qnil); + rb_iv_set(self, "@xsl", Qnil); + rb_iv_set(self, "@xfiles", Qnil); + rb_iv_set(self, "@xmsg", Qnil); + rb_iv_set(self, "@xparams", Qnil); + rb_iv_set(self, "@xroot", Qnil); + rb_iv_set(self, "@xtrack", Qfalse); + rb_iv_set(self, "@xerr", Qnil); + + return self; +} + + +VALUE mGorg; +VALUE cXSL; + +/* + * Library Initialization + */ +void Init_xsl( void ) +{ + mGorg = rb_define_module( "Gorg" ); + cXSL = rb_define_class_under( mGorg, "XSL", rb_cObject ); + + // Get our lib global mutex + rb_require("thread"); + g_mutex = rb_eval_string("Mutex.new"); + + // Get method ID's + id.include = rb_intern("include?"); + id.to_a = rb_intern("to_a"); + id.to_s = rb_intern("to_s"); + id.length = rb_intern("length"); + id.synchronize = rb_intern("synchronize"); + + // Register lib global variables with ruby's GC + rb_global_variable(&g_mutex); + rb_global_variable(&g_xfiles); + rb_global_variable(&g_xmsg); + rb_global_variable(&g_xroot); + + rb_define_const( cXSL, "ENGINE_VERSION", rb_str_new2(xsltEngineVersion) ); + rb_define_const( cXSL, "LIBXSLT_VERSION", INT2NUM(xsltLibxsltVersion) ); + rb_define_const( cXSL, "LIBXML_VERSION", INT2NUM(xsltLibxmlVersion) ); + 
rb_define_const( cXSL, "XSLT_NAMESPACE", rb_str_new2(XSLT_NAMESPACE) ); + rb_define_const( cXSL, "DEFAULT_VENDOR", rb_str_new2(XSLT_DEFAULT_VENDOR) ); + rb_define_const( cXSL, "DEFAULT_VERSION", rb_str_new2(XSLT_DEFAULT_VERSION) ); + rb_define_const( cXSL, "DEFAULT_URL", rb_str_new2(XSLT_DEFAULT_URL) ); + rb_define_const( cXSL, "NAMESPACE_LIBXSLT", rb_str_new2(XSLT_LIBXSLT_NAMESPACE) ); + + rb_define_method( cXSL, "initialize", xsl_init, 0 ); + + rb_define_method( cXSL, "xmsg", xsl_xmsg_get, 0 ); // Return array of '%%GORG%%.*' strings returned by the XSL transform with <xsl:message> + rb_define_method( cXSL, "xfiles", xsl_xfiles_get, 0 ); // Return array of names of all files that libxml2 opened during last process + rb_define_method( cXSL, "xparams", xsl_xparams_get, 0 ); // Return hash of params + rb_define_method( cXSL, "xparams=", xsl_xparams_set, 1 ); // Set hash of params to pass to the xslt processor {"name" => "value"...} + rb_define_method( cXSL, "xroot", xsl_xroot_get, 0 ); // Root dir where we should look for files with absolute path + rb_define_method( cXSL, "xroot=", xsl_xroot_set, 1 ); // See the root dir as a $DocumentRoot + rb_define_method( cXSL, "xtrack?", xsl_xtrack_get, 0 ); // Should I track the files that libxml2 opens + rb_define_method( cXSL, "xtrack=", xsl_xtrack_set, 1 ); // Track the files that libxml2 opens, or not + rb_define_method( cXSL, "xml", xsl_xml_get, 0 ); + rb_define_method( cXSL, "xml=", xsl_xml_set, 1 ); + rb_define_method( cXSL, "xsl", xsl_xsl_get, 0 ); + rb_define_method( cXSL, "xsl=", xsl_xsl_set, 1 ); + rb_define_method( cXSL, "xerr", xsl_xerr_get, 0 ); + rb_define_method( cXSL, "xres", xsl_xres_get, 0 ); + rb_define_method( cXSL, "process", xsl_process, 0 ); +} diff --git a/ext/gorg/xsl/xsl.h b/ext/gorg/xsl/xsl.h new file mode 100644 index 0000000..0fa9588 --- /dev/null +++ b/ext/gorg/xsl/xsl.h @@ -0,0 +1,44 @@ +/* + Copyright 2004, Xavier Neys (neysx@gentoo.org) + + This file is part of gorg. 
+ + gorg is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + gorg is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Foobar; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#ifndef __XSL_H__ +#define __XSL_H__ + +#include <sys/stat.h> +#include <ruby.h> +#include <libxslt/xslt.h> +#include <libexslt/exslt.h> +#include <libxslt/xsltInternals.h> +#include <libxslt/extra.h> +#include <libxslt/xsltutils.h> +#include <libxslt/transform.h> + +typedef struct S_cleanup +{ + char *params; + xmlDocPtr docxml, docxsl, docres; + xsltStylesheetPtr xsl; + xmlChar *docstr; +} +s_cleanup; + +#define XSL_VERSION "0.1" + +#endif diff --git a/lib/gorg/base.rb b/lib/gorg/base.rb new file mode 100644 index 0000000..c3851a9 --- /dev/null +++ b/lib/gorg/base.rb @@ -0,0 +1,602 @@ +### Copyright 2004, Xavier Neys (neysx@gentoo.org) +# # +# # This file is part of gorg. +# # +# # gorg is free software; you can redistribute it and/or modify +# # it under the terms of the GNU General Public License as published by +# # the Free Software Foundation; either version 2 of the License, or +# # (at your option) any later version. +# # +# # gorg is distributed in the hope that it will be useful, +# # but WITHOUT ANY WARRANTY; without even the implied warranty of +# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# # GNU General Public License for more details. 
+# # +# # You should have received a copy of the GNU General Public License +# # along with Foobar; if not, write to the Free Software +### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +module Gorg + Version = "0.6" +end + +# Some required stuff for gorg +require 'time' + +require 'gorg/xsl' +require 'gorg/log' +require 'gorg/cache' +require 'timeout' +require 'cgi' +require 'stringio' +require 'zlib' +require 'ipaddr' + + +module Gorg + + def xproc(path, params, list=false, printredirect=false) + # Process file through xslt passing params to the processor + # path should be the absolute path of the file, i.e. not relative to DocumentRoot + # + # Since 0.4, path can also be a string containing + # the actual xml to be processed + # + # Use default stylesheet if none can be found in the file + # Return a list of files read by the processor (useful to do caching) if requested + # + # Return an error condition and, hopefully, some useful output + # Do not raise any exception + # In most cases, an error will result in no output but + # the xslt processor can consider some errors as warnings and + # return the best result it could come up with along with a warning + # e.g. if a file used in a document() function cannot be found, + # the xslt processor will return some output and a warning. + # It's up to the caller to decide whether to use the output or b0rk + # + # The return value is an array of 2 to 4 items: [{}, "", [[]], []] + # 1. hash with error information, its keys are + # 1.a "xmlErrCode" 0 is no error, -9999 means an exception has been raised in this block (unlikely), + # anything else is an error code (see /usr/include/libxml2/libxml/xmlerror.h) + # 1.b "xmlErrLevel" again, from libxml2, 0==OK, 1==Warning, 2==Error, 3==Fatal + # 1.c "xmlErrLevel" again, from libxml2, some explanation about what went wrong + # 2. output from xsltprocessor (or error message from a raised exception) + # 3. 
list of files that the xslt processor accessed if the list was requested, + # paths are absolute, i.e. not relative to your docroot. + # Each entry is an array [access type, path] with access_type being + # "r" for read, "w" for written (with exsl:document) or "o" for other (ftp:// or http://) + # 4. array of CGI::Cookie to be sent back + # + # Examples: [{"xmlErrMsg"=>"blah warning blah", "xmlErrCode"=>1509, "xmlErrLevel"=>1}, "This is the best XSLT could do!", nil] + # [{"xmlErrCode"=>0}, "Result of XSLT processing. Well done!", ["/etc/xml/catalog","/var/www/localhost/htdocs/doc/en/index.xml","/var/www/localhost/htdocs/dtd/guide.dtd"]] + + xsltproc = Gorg::XSL.new + xsltproc.xroot = $Config["root"] + # Grab strings from xsl:message + xslMessages = [] + # Does the caller want a list of accessed files? + xsltproc.xtrack = list; filelist = Array.new + # Process .xml file with stylesheet(s) specified in file, or with default stylesheet + xsltproc.xml = path + # Look for stylesheet href (there can be more than one) + regexp = Regexp.new('<\?xml-stylesheet.*href="([^"]*)".*') + l = $Config["headXSL"] ; styles = Array.new + if FileTest.file?(path) then + # Path is indeed a file name + IO.foreach(path) { |line| + styles << $1 if regexp.match(line) + break if (l-=1) == 0 + } + else + # Scan xml for stylesheet names + path.each { |line| styles << $1 if regexp.match(line) } + end + # Use default stylesheet if none were found in the doc + styles << $Config["defaultXSL"] if styles.length == 0 + # Add params, we expect a hash of {param name => param value,...} + xsltproc.xparams = params + # Process through list of stylesheets + firstErr = {} + while xsltproc.xsl = styles.shift + xsltproc.process + filelist += xsltproc.xfiles if xsltproc.xtrack? 
+ # Break and raise 301 on redirects + xsltproc.xmsg.each { |r| + if r =~ /Redirect=(.+)/ then + if printredirect then + STDERR.puts "Location: #{$1}" + else + raise Gorg::Status::MovedPermanently.new($1) + end + end + } + xslMessages += xsltproc.xmsg + # Remember 1st warning / error + firstErr = xsltproc.xerr if firstErr["xmlErrLevel"].nil? && xsltproc.xerr["xmlErrLevel"] > 0 + # B0rk on error, an exception should have been raised by the lib, but, er, well, you never know + break if xsltproc.xerr["xmlErrLevel"] > 1 + xsltproc.xml = xsltproc.xres + end + # Keep 1st warning / error if there has been one + firstErr = xsltproc.xerr if firstErr["xmlErrLevel"].nil? + # Return values + [ firstErr, xsltproc.xres, (filelist.uniq if xsltproc.xtrack?), xslMessages ] + rescue => ex + if ex.respond_to?(:errCode) then + # One of ours (Gorg::Status::HTTPStatus) + # Propagate exception + raise + else + debug "in xproc exception handler: #{ex.inspect} // #{xsltproc.xerr.inspect}" + # Return exception message and an error hash as expected from the xslt processor + # Use error codes that the xslt lib might have returned + [ if (xsltproc.xerr["xmlErrCode"]||-1) == 0 then + { "xmlErrMsg" => ex.to_s, + "xmlErrCode" => 9999, + "xmlErrLevel" => 3 + } + else + { "xmlErrMsg" => xsltproc.xerr["xmlErrMsg"] || ex.to_s, + "xmlErrCode" => xsltproc.xerr["xmlErrCode"], + "xmlErrLevel" => xsltproc.xerr["xmlErrLevel"] + } + end , + ex.to_s, + (filelist.uniq if xsltproc.xtrack?) 
+ ] + end + end + + # HTTP status codes and html output + module Status + class HTTPStatus < StandardError + def html(err="") + <<-EOR +<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN"> +<HTML> +<HEAD><TITLE>#{errSts}</TITLE></HEAD> +<BODY> +<H1>#{errLabel}</H1> +<font color="#FF0000">#{err}</font> +<HR> +</BODY> +</HTML> + EOR + end + def errSts + "#{errCode} #{errLabel}" + end + # Default is unknown error + def errLabel + "Undefined Error" + end + def errCode + 999 + end + def header + {'Status' => errSts} + end + end + + class NotModified < HTTPStatus + def initialize(stat) + # 304 needs to send ETag and Last-Modified back + @mstat=stat + end + def header + {'Last-Modified' => @mstat.mtime.httpdate.dup, 'ETag' => makeETag(@mstat).dup}.merge(super) + end + def html + "" + end + def errLabel + "Not Modified" + end + def errCode + 304 + end + end + + class MovedPermanently < HTTPStatus + def initialize(loc) + # 301 needs to send Location: + @location=loc + end + def errLabel + "Moved Permanently" + end + def errCode + 301 + end + def header + {'Location' => @location}.merge(super) + end + def html + # RFC says "should" not "must" add a body + "" + end + def html301 # Not used + <<-EO301 +<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN"> +<html><head> +<title>301 Moved Permanently</title> +</head><body> +<h1>Moved Permanently</h1> +<p>The document has moved <a href="#{@location}">here</a>.</p> +</body></html> + EO301 + end + end + + class Forbidden < HTTPStatus + def errLabel + "Forbidden" + end + def errCode + 403 + end + end + + class NotFound < HTTPStatus + def errLabel + "Not Found" + end + def errCode + 404 + end + end + + class NotAllowed < HTTPStatus + def errLabel + "Method Not Allowed" + end + def header + {'Allow'=>'GET,HEAD'}.merge(super) + end + def errCode + 405 + end + end + + class SysError < HTTPStatus + def errLabel + "Internal Server Error" + end + def errCode + 500 + end + end + end #Status module + + + def gorgInit + # Initialize gorg, i.e. 
read config file, init cache, ... + # Simply build a hash of params => value in a global variable called $Config + + # Set up default values + $Config = { "AppName" => "gorg", # Used for syslog entries, please keep 'gorg' (cannot be changed in config file) + "root" => nil, # No root dir by default (cgi uses DOCUMENT_ROOT from its environment) + "port" => 8000, # Used for stand-alone web server (WEBrick) + "headXSL" => 12, # Only read 12 lines in xml files to identify required stylesheets + "defaultXSL" => nil, # No default stylesheet, how could I guess? + "cacheDir" => nil, # No cache by default. Directory must exist and be writable. + "cacheTTL" => 0, # Number of seconds after which a document is considered too old, 0=never + "cacheSize" => 40, # in MegaBytes, max size of cache, used when autocleanig + "zipLevel" => 2, # Compresion level used for gzip support (HTTP accept_encoding) (0-9, 0=none, 9=max) + "maxFiles" => 9999, # Max number of files in a single directory in the cache tree + "cacheTree" => 0, # Use same tree as on site in cache, 0 = disabled + "cacheWash" => 0, # Clean cache automatically and regularly when a store into the cache occurs. 0 = disabled + # gorg cleans up if random(param_value) < 10. It will only clean same dir it caches to, not whole tree. + # i.e. 
a value<=10 means at every call (not a good idea), 100 means once/10 stores, 1000 means once/100 stores + "logLevel" => 4, # INFO, be slightly verbose by default (messages go to syslog) OFF, FATAL, ERROR, WARN, INFO, DEBUG = 0, 1, 2, 3, 4, 5 + "passthru" => true, # Allow return of requested file without processing it if passthru="anything but 0" is passed + "acceptCookies" =>false,# Allow cookies in & out of transforms + "linkParam" => "link", # Pass pathname of requested file in 'link' param to xsl transform + "HTTP_HOST" => nil, # Pass host value from HTTP header to xsl transform + "accessLog" => "syslog",# or a filename or STDERR, used to report hits from WEBrick, not used by cgi's + "autoKill" => 0, # Only used by fastCGI, exit after so many requests (0 means no, <=1000 means 1000). Just in case you fear memory leaks. + "in/out" => [], # (In/Ex)clude files from indexing + "mounts" => [], # Extran mounts for stand-alone server + "listen" => "127.0.0.1" # Let webrick listen on given IP + } + # Always open syslog + @syslog = Gorg::Log::MySyslog.new($Config["AppName"]) + $Log = Gorg::Log::MyLog.new(@syslog, 5) # Start with max + + # Check for config file + configf = ENV["GORG_CONF"]||"/etc/gorg/gorg.conf" + raise "Cannot find config file (#{configf})" unless FileTest.file?(configf) and FileTest.readable?(configf) + file = IO.read(configf) + parseConfig($Config, file) + + # Init cache + Cache.init($Config) if $Config["cacheDir"] + + # Set requested log level + $Log.level = $Config["logLevel"] + rescue + error("Gorg::init failed: #{$!}") + STDERR.puts("Gorg::init failed: #{$!}") + exit(1) + end + + def scanParams(argv) + # Scan argv for --param paramName paramValue sequences + # params are removed from argv + # Return a hash of {"name" => "value"} + h = Hash.new + while idx = argv.index('--param') + break if argv.length <= idx+2 # We need at least 2 more args after --param + argv.delete_at(idx) # Remove --param from argv + name = argv.delete_at(idx) # Remove param 
name from argv + value = argv.delete_at(idx) # Remove param value from argv + h[name] = value # Add entry in result + end + + h if h.length > 0 + end + + private + def parseConfig(h, config) + config.each {|line| + line.strip! + next if line.length == 0 or line[0,1] == '#' # Skip blank lines and comments + raise "Invalid Configuration (#{line})" unless line =~ /^([a-zA-Z_]*)\s*=\s*/ + param = $1 + value = $' + # If value starts with ' or ", it ends with a similar sign and does not accept any in the value, no escaping... We keep it simple + # otherwise, it ends with EOL or first space + if value =~ /["'](.*)['"]/ then + value = $1 + end + value.strip! + raise "No value for #{param}" unless value.length > 0 + # Check param / value (only syntactical checks here) + case param.downcase + when "root" + h["root"] = value + when "port" + h["port"] = value.to_i + when "passthru" + h["passthru"] = value.squeeze != "0" + when "acceptcookies" + h["acceptCookies"] = value.squeeze == "1" + when "linkparam" + if value =~ /^\s*([a-zA-Z]+)\s*$/ then + h["linkParam"] = $1 + else + h["linkParam"] = nil + end + when "httphost" + hosts = value.squeeze(" ") + case hosts + when /^0?$/ + hh = nil + when "*" + hh = ["*"] + else + hh = hosts.split(" ") + # Add IPs + hosts.split(" ").each { |ho| + begin + hh += TCPSocket.gethostbyname(ho)[3..-1] if ho != '*' + rescue + # Ignore + nil + end + } + hh.uniq! 
+ end + h["httphost"] = hh + when "headxsl" + h["headXSL"] = value.to_i + when "defaultxsl" + h["defaultXSL"] = value + when "cachedir" + h["cacheDir"] = value + when "cachettl" + h["cacheTTL"] = value.to_i + when "cachesize" + h["cacheSize"] = value.to_i + when "maxfiles" + h["maxFiles"] = value.to_i + when "cachetree" + h["cacheTree"] = value.squeeze != "0" + when "ziplevel" + if value =~ /^\s*([0-9])\s*$/ then + h["zipLevel"] = $1.to_i + else + h["zipLevel"] = 2 + end + when "cachewash" + h["cacheWash"] = value.to_i + when "loglevel" + h["logLevel"] = value.to_i + when "accesslog" + h["accessLog"] = value + when "autokill" + h["autoKill"] = value.to_i + when "listen" + begin + ip = IPAddr.new(value) + h["listen"] = ip.to_s + rescue + h["listen"] = "127.0.0.1" + end + when "dbconnect" + h["dbConnect"] = value + when "dbuser" + h["dbUser"] = value + when "dbpassword" + h["dbPassword"] = value + when "exclude" + h["in/out"] << [false, Regexp.new(value)] + when "include" + h["in/out"] << [true, Regexp.new(value)] + when "fpath_to_lang" + h["flang"] = Regexp.new(value) + when "xpath_to_lang" + h["xlang"] = value + when "mount" + if value =~ /^([^\s]+)\s+ON\s+(.+)$/i then + h["mounts"] << [$1, $2] + end + else + raise "Unknown parameter (#{param})" + end + } + rescue + raise "Could not parse config file: #{$!}" + end + + # Utilities + def contentType(aMsg) + # Find the Content-Type=xxx/yyy line in aMsg + # from the Meta file in the cache + ct = nil + aMsg.each { |s| + if s =~ /^Content-Type:(.+)$/ then + ct = $1 + break + end + } + ct + end + + def setContentType(data) + # Set content-type according to x(ht)ml headers + charset = nil + if data =~ /^<\?xml .*encoding=['"](.+)['"]/i then + charset = $1 if $1 + # XML / XHTML + if data[0..250] =~ /^<\!DOCTYPE\s+html/i then + # XHTML + ct = 'application/xhtml+xml' + else + # XML + ct = 'text/xml' + end + if charset then + ct << "; charset=#{charset}" + end + elsif data =~ /^<\!DOCTYPE\s+html\sPUBLIC\s(.+DTD XHTML)?/i then 
+ # (X)HTML + if $1 then + # XHTML + ct = 'application/xhtml+xml' + else + # HTML + ct = 'text/html' + end + elsif data =~ /<html/i then + # HTML + ct = 'text/html' + else + # TXT + ct = 'text/plain' + end + ct + end + + def makeCookies(aMsg) + # Make an array of CGI::Cookie objects + # msg is expected to be an array of strings like 'Set-Cookie(name)value=param' + # (output by the xsl transform with xsl:message) + cookies = Hash.new + aMsg.each { |s| + if s =~ /^Set-Cookie\(([^\)]+)\)([a-zA-Z0-9_-]+)=(.+)$/ then + # $1 = cookie name $2 = key $3 = value + if cookies.has_key?($1) then + cookies[$1] << "#{$2}=#{$3}" + else + cookies[$1] = ["#{$2}=#{$3}"] + end + end + } + if cookies.length > 0 then + # Make CGI::Cookie objects + cookies.map { |k,v| + CGI::Cookie.new('name' => k, 'value' => v, 'expires' => Time.now + 3600*24*30) + } + else + nil + end + end + + def cookies_to_params(cookies) + # Turn array of CGI::Cookie objects into a Hash of key=>value + # cookies is a hash, forget the keys, + # each value should be an array of strings, each string should be like 'param=value' + h = {} + cookies.values.each { |v| + if v.class==String and v =~ /^([a-zA-Z0-9_-]+)=(.+)$/ then + h[$1] = $2 + elsif v.class==Array then + v.each { |vv| + if vv.class==String and vv =~ /^([a-zA-Z0-9_-]+)=(.+)$/ then + h[$1] = $2 + end + } + elsif v.class==CGI::Cookie then + v.value.each { |vv| + if vv.class==String and vv =~ /^([a-zA-Z0-9_-]+)=(.+)$/ then + h[$1] = $2 + end + } + end + } + h + rescue + error "Could not parse cookies (#{$!}) " + {} + end + + def notModified?(fstat, etags, ifmodsince) + # Decide whether file has been modified according to either etag, last mod timestamp or both + # If both If-None-Match and If-Modified-Since request header fields are present, + # they have to be tested both + res = false + if fstat then + a = etags.to_a + if ifmodsince && etags then + res = (ifmodsince >= fstat.mtime) && (a.include?(makeETag(fstat)) || a.include?('*')) + elsif etags + res = 
a.include?(makeETag(fstat)) || a.include?('*') + elsif ifmodsince + res = ifmodsince >= fstat.mtime + end + end + # Return result + res + end + + def split_header_etags(str) + # Split header values expected as "value1", "value2", ... into an array of strings + str.scan(/((?:"(?:\\.|[^"])+?"|[^",]+)+)(?:,\s*|\Z)/xn).collect{|v| v[0].strip } + end + + def makeETag(st) + # Format file stat object into an ETag using its size & mtime + # Parameter can either be a filename or a stat object + st = File.stat(st) unless st.respond_to?(:ino) + sprintf('"%x-%x"', st.size, st.mtime.to_i) + end + + def gzip(data, level) + gz = "" + io = StringIO.new(gz) + gzw = Zlib::GzipWriter.new(io, level) + gzw.write data + gzw.close + gz + end + + def gunzip(data) + io = StringIO.new(data) + gzw = Zlib::GzipReader.new(io) + gunz = gzw.read + gzw.close + gunz + end + +end diff --git a/lib/gorg/cache.rb b/lib/gorg/cache.rb new file mode 100644 index 0000000..543b6a2 --- /dev/null +++ b/lib/gorg/cache.rb @@ -0,0 +1,493 @@ +### Copyright 2004, Xavier Neys (neysx@gentoo.org) +# # +# # This file is part of gorg. +# # +# # gorg is free software; you can redistribute it and/or modify +# # it under the terms of the GNU General Public License as published by +# # the Free Software Foundation; either version 2 of the License, or +# # (at your option) any later version. +# # +# # gorg is distributed in the hope that it will be useful, +# # but WITHOUT ANY WARRANTY; without even the implied warranty of +# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# # GNU General Public License for more details. +# # +# # You should have received a copy of the GNU General Public License +# # along with Foobar; if not, write to the Free Software +### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +# Cache a bit of data based on +# . a path name as received by a webserver e.g. +# . a list of parameters as received by a webserver e.g. +# . 
# a list of files it depends on

require "parsedate" # NOTE(review): ParseDate is a Ruby 1.8 library, confirm before running on a newer Ruby
require "fileutils"
require "find"
require "digest"
require "digest/md5"

module Gorg

# Signature written as the first line of every meta file.
# Cache.hit rejects any meta file that does not start with it.
CacheStamp = "Gorg-#{Gorg::Version} Cached This Data. Do not alter this file. Thanks."

module Cache
  # Initialise the cache from the configuration hash.
  #
  # Keys read from config: cacheDir, zipLevel, cacheTTL, cacheTree,
  # maxFiles, cacheSize (in MB) and cacheWash.
  # When cacheDir is not a writable directory a warning is logged and
  # @cacheDir stays nil, which makes hit/store/washCache return nil.
  def Cache.init(config)
    @@lockfile = ".cache.cleaner.lock"
    @cacheDir = nil
    if FileTest.directory?(config["cacheDir"])
      if FileTest.writable?(config["cacheDir"])
        @cacheDir = config["cacheDir"].chomp("/")
      else
        warn "Cache directory not writable"
      end
    else
      warn "Invalid cache directory"
    end

    @zipLevel = config["zipLevel"]
    @zip = @zipLevel > 0 ? ".gz" : ""          # Suffix of data files when compression is on
    # Time-To-Live in seconds, cached items older than that will be considered too old
    @ttl = config["cacheTTL"]
    @cacheTree = config["cacheTree"]
    @maxFiles = config["maxFiles"]             # Max number of files in a single directory
    @maxSize = config["cacheSize"]*1024*1024   # Now in bytes
    @washNumber = config["cacheWash"]          # Clean cache dir after a store operation whenever rand(@washNumber) < 10
    @lastCleanup = Time.new-8e8                # Remember last time we started a cleanup so we don't pile them up
  end

  # Try to fetch an object from the cache.
  #
  # Returns [data, stat(datafile), extrameta] on a hit and nil on a miss
  # (any failure is logged at debug level and turned into a miss).
  # Raises Gorg::Status::NotModified when the client copy is still valid.
  def Cache.hit(objPath, objParam={}, etags=nil, ifmodsince=nil)
    # objPath is typically a requested path passed from a web request but it
    # can be just any string. It is not checked against any actual files on the file system
    #
    # objParam is expected to be a hash or any object whose iterator yields two values
    #
    # 2 filenames are built with the arguments and should give
    # the name of a metafile and a result file
    # if the result file is older than @ttl seconds, hit fails
    # The metafile is then checked for dependencies
    # It contains a list of filenames along with their size and mtime separated by ;;

    # etags and ifmodsince are used in a webserver context
    # etags is defined if an ETag was part of an If-None-Match request field
    # etags can be an array or a single string
    # ifmodsince is a time object passed on an If-Modified-Since request field
    # Both are handed to notModified? together with the stat of the data file;
    # when it answers true no data is returned (webserver should return a 304)

    return nil if @cacheDir.nil? # Not initialized, ignore request

    # Reminder: filenames are full path, no need to prepend dirname
    dirname, basename, filename, metaname = makeNames(objPath, objParam)

    raise "Cache subdir does not exist" unless FileTest.directory?(dirname)

    # Hit the cache
    meta, mstat = IO.read(metaname), File.stat(metaname) if metaname && FileTest.file?(metaname) && FileTest.readable?(metaname)
    raise "Empty/No meta file" if meta.nil? || meta.length < 1

    fstat = File.stat(filename) if filename && FileTest.file?(filename)
    raise "Empty/No data file" if fstat.nil?

    # Check the timestamps of files in the metadata
    meta = meta.split("\n")
    raise "I did not write that meta file" unless CacheStamp == meta.shift
    mline = meta.shift
    while mline and mline !~ /^;;extra meta$/ do
      f, s, d = mline.split(";;")
      if s.to_i < 0
        # File did not exist when cache entry was created
        raise "Required file #{f} has (re)appeared" if FileTest.file?(f) && FileTest.readable?(f)
      else
        # File did exist when cache entry was created, is it still there?
        raise "Required file #{f} has disappeared" unless FileTest.file?(f) && FileTest.readable?(f)

        fst = File.stat(f)
        # s is the size recorded at store time, fst.size is the size now
        raise "Size of #{f} has changed from #{s.to_i} to #{fst.size}" unless fst.size == s.to_i
        raise "Timestamp of #{f} has changed" unless Time.utc(*ParseDate.parsedate(d)) == fst.mtime.utc
      end
      mline = meta.shift
    end
    # Whatever follows the ";;extra meta" marker is returned untouched to the caller
    if mline =~ /^;;extra meta$/ then
      extrameta = meta.dup
    else
      extrameta = []
    end

    if notModified?(fstat, etags, ifmodsince) and extrameta.join !~ /set-cookie/i
      raise Gorg::Status::NotModified.new(fstat)
    end

    file = IO.read(filename) if filename && FileTest.file?(filename) && FileTest.readable?(filename)
    raise "Empty/No data file" if file.nil? || file.length < 1

    # Is the data file too old
    # NOTE(review): Cache.store sets the data file mtime to the newest dependency mtime,
    # so this compares @ttl with the age of the sources, not of the cache entry - confirm intent
    raise "Data file too old" unless @ttl==0 or (Time.new - fstat.mtime) < @ttl

    # Update atime of files, ignore failures as files might have just been removed
    begin
      t = Time.new
      File.utime(t, fstat.mtime, filename)
      File.utime(t, mstat.mtime, metaname)
    rescue
      nil
    end

    # If we get here, it means the data file can be used, return cache object (data, stat(datafile), extrameta)
    # The file is left (un)compressed, it's returned as it was stored
    [file, fstat, extrameta]

  rescue Gorg::Status::NotModified
    # Nothing changed, should return a 304
    debug("Client cache is up-to-date")
    raise
  rescue
    # cache hit fails if anything goes wrong, no exception raised
    debug("Cache hit on #{objPath} failed: (#{$!})")
    nil
  end


  # Store data in the cache.
  #
  # Returns [stat(datafile), zipped data or nil] on success, nil when the
  # object was not cached (cache not initialised, remote dependency, error).
  # Side effect: "Content-Type:..." is appended to the caller's extrameta
  # array even when nothing gets cached.
  def Cache.store(data, objPath, objParam={}, deps=[], extrameta=[])
    # Store data in cache so it can be retrieved based on the objPath and objParams
    # deps should contain a list of files that the object depends on
    # as returned by our xsl processor, i.e. an array of [access_type, path] where
    # access_type can be "r", "w", or "o" for respectively read, write, other.

    # Define content-type
    ct = setContentType(data)
    extrameta << "Content-Type:#{ct}"

    return nil if @cacheDir.nil? # Not initialized, ignore request

    # Cache only if no remote objects (ftp:// or http://) in list of used files
    if deps && deps.detect{|f| f[0] =~ /^o$/i }
      debug "#{objPath} not cached because it needs remote resources"
      return nil
    end

    dirname, basename, filename, metaname = makeNames(objPath, objParam)

    FileUtils.mkdir_p(dirname) unless FileTest.directory?(dirname)

    # Write Meta file to a temp file (with .timestamp.randomNumber appended)
    metaname_t = "#{metaname}.#{Time.new.strftime('%Y%m%d%H%M%S')}.#{rand(9999)}"

    # Data might need to be just a link to another .Data file
    # if we find another requested path with different params but
    # with identical MD5 sums
    # Which is why we keep a ...xml.Data.[md5 sum] file without the parameters
    # in its name that we can hard link to.
    # e.g. somebody requests 10 full handbooks with toto=1..10 in the URI,
    # we'd end up with 10 identical large copies. With links we have only one

    # Old versions are expected to be cleaned up by the washCache() routine
    # A Dir.glob() to find the previous ones would be too expensive

    # Compute MD5 digest (of the uncompressed data)
    md5 = Digest::MD5.hexdigest(data)

    # Compress data if required
    if @zipLevel > 0 then
      bodyZ = data = gzip(data, @zipLevel)
    else
      bodyZ = nil
    end

    # Set mtime of data file to latest mtime of all required files
    # so that caching can work better because mtimes will be
    # identical on all webnodes whereas creation date of data
    # would be different on all nodes.
    maxmtime = Time.now-8e8
    fstat = nil

    begin
      Timeout.timeout(10){
        File.open("#{metaname_t}", "w") {|fmeta|
          fmeta.puts(CacheStamp)
          # Write filename;;size;;mtime for each file in deps[]
          # deps may be nil (see the guard above), treat that as "no dependencies"
          (deps || []).each {|ffe|
            ftype = ffe[0]
            fdep = ffe[1]
            if FileTest.file?(fdep)
              s = File.stat(fdep)
              fmeta.puts("#{fdep};;#{s.size};;#{s.mtime.utc};;#{ftype}")
              maxmtime = s.mtime if s.mtime > maxmtime and ftype =~ /^r$/i
            else
              # A required file does not exist, use size=-1 and old timestamp
              # so that when the file comes back, the cache notices a difference
              # and no cache miss gets triggered as long as file does not exist
              fmeta.puts("#{fdep};;-1;;Thu Nov 11 11:11:11 UTC 1971")
            end
          }
          fmeta.puts ";;extra meta"
          extrameta.each { |m| fmeta.puts m }
        }
        # Get exclusive access to the cache directory while moving files and/or creating data files
        File.open(dirname) { |lockd|
          while not lockd.flock(File::LOCK_NB|File::LOCK_EX)
            # Timeout does not occur on a blocking lock
            # Try a non-blocking one repeatedly for a few seconds until timeout occurs or lock is granted
            # We are in a timeout block, remember
            sleep 0.1
          end
          # Remove previous Data
          FileUtils.rm_rf(filename)

          # mv temp meta file to meta file
          FileUtils.mv(metaname_t, metaname)

          # We keep a data file for the same requested path, with different params,
          # but which ends up with same MD5 sum, i.e. identical results because of unused params
          linkname = "#{basename}.#{md5}#{@zip}"
          if FileTest.file?(linkname) then
            # Data file already there, link to it
            File.link(linkname, filename)
          else
            # Write data file and set its mtime to latest of all files it depends on
            File.open("#{filename}", "w") {|fdata| fdata.write(data)}
            # Create link
            File.link(filename, linkname)
          end
          # mtime might need to be updated, or needs to be set
          # e.g. when a dependency had changed but result files is identical
          # This is needed to keep Last-Modified dates consistent across web nodes
          File.utime(Time.now, maxmtime, filename)
          fstat = File.stat(filename)
        }
      }
    ensure
      FileUtils.rm_rf(metaname_t)
    end

    # Do we clean the cache?
    washCache(dirname, 10) if @washNumber > 0 and rand(@washNumber) < 10

    # Return stat(datafile) even if it's just been removed by washCache
    # because another web node might still have it or will have it.
    # Anyway, the cached item would be regenerated on a later request
    # and a 304 would be returned if still appropriate at the time.

    # Return fstat of data file (for etag...) and zipped file
    [fstat, bodyZ]

  rescue Timeout::Error, StandardError =>ex
    if ex.is_a?(Timeout::Error) then
      warn("Timeout in cache store operation")
    else
      warn("Cache store error (#{$!})")
    end
    # Clean up before leaving
    FileUtils.rm_rf(filename||"")
    FileUtils.rm_rf(metaname||"")
    nil # return nil so that caller can act if a failed store really is a problem
  end


  # Clean the cache directory dirname (and its subdirectories when cleanTree is true).
  #
  # Entries are removed when they are older than the TTL (in seconds), when the
  # total size exceeds maxSize, or when a directory holds too many files (see washDir).
  # tmout is the maximum time (in seconds) spent in here.
  # Progress is logged, and also printed on stdout when cleanTree is true.
  # Errors and timeouts are logged, never raised.
  def Cache.washCache(dirname, tmout=30, cleanTree=false)
    return nil if @cacheDir.nil? # Not initialized, ignore request

    # Also ignore request if dirname not equal to @cacheDir or under it
    # (compare with a trailing "/" so that e.g. /cache2 is not taken for a subdir of /cache)
    return nil unless dirname == @cacheDir || dirname[0, @cacheDir.length + 1] == "#{@cacheDir}/"

    # Also ignore request if dirname does not exist yet
    return nil unless FileTest.directory?(dirname)

    # Also return if less than a minute has elapsed since latest cleanup
    t0 = Time.new
    return nil if t0 - @lastCleanup < 60

    # Remember for next time
    @lastCleanup = t0

    Dir.chdir(dirname) { |d|
      # Recreate lock file if it's been lost
      unless File.exist?(@@lockfile)
        File.open(@@lockfile, "w") { |lockf| lockf.puts("Lock file created on #{Time.now.utc} by gorg")}
      end

      # Grab lockfile
      File.open(@@lockfile) { |lockf|
        if lockf.flock(File::LOCK_NB|File::LOCK_EX) then
          infoMsg = "Cleaning up cache in #{dirname} (cleanTree=#{cleanTree}, tmout=#{tmout})"
          info(infoMsg)
          puts infoMsg if cleanTree

          Timeout.timeout(tmout) {
            totalSize, deletedFiles, scannedDirectories = washDir(dirname, cleanTree)
            if totalSize >= 0 then
              # Size == -1 means dir was locked, throwing an exception would have been nice :)
              infoMsg = if cleanTree then
                "Cache in #{dirname} is now #{totalSize/1024/1024} MB, #{deletedFiles} files removed in #{(Time.now-t0).to_i} seconds in #{scannedDirectories} directories"
              else
                "#{deletedFiles} files removed in #{(Time.now-t0).to_i} seconds in #{dirname}"
              end
              info(infoMsg)
              puts infoMsg if cleanTree
            end
          }
        else
          # Locked dir, another process is busy cleaning up
          debug("#{dirname} locked, skipping")
          puts("#{dirname} locked, skipping") if cleanTree
        end # of lock test
      } # end of File.open(@@lockfile), close & release lock automatically
    }
  rescue Timeout::Error
    info("Timeout while cleaning #{dirname}")
    puts("Timeout while cleaning #{dirname}") if cleanTree
  rescue StandardError =>ex
    error("Error while cleaning cache: #{ex}")
    puts("Error while cleaning cache: #{ex}") if cleanTree
  end


  # NB: a bare "private" does not apply to methods defined as "def Cache.xxx",
  # so washDir and makeNames below remain callable from outside.
  private

  # Clean up cache starting from dirname and in subdirectories if cleanTree is true
  # Return [newSize in bytes, # deleted files, # scanned directories]
  def Cache.washDir(dirname, cleanTree)
    size = nDeleted = nDirectories = 0

    Dir.chdir(dirname) { |d|
      hIno = Hash.new(0) # hash of file inodes with more than one link
      lst = Array.new # array of file names, atime, ...
      ttl = @ttl
      ttl = 8e8 if ttl == 0 # No ttl, keep very old docs!
      skipRE = /^\.$|#{Regexp.escape(@@lockfile)}/ # matches "." and the lockfile

      # Get list of files sorted on their dirname+atime
      Find.find('.') { |f|
        begin
          unless f =~ skipRE # ignore "." and lockfile
            ff = File.stat(f)
            if ff.directory? then
              Find.prune unless cleanTree
            elsif ff.file? and f =~ /Meta|Data/ then
              hIno[ff.ino] = ff.nlink if ff.nlink > 1
              # List of files has [name, atime, size, # links, inode]
              lst << [f, ff.atime, ff.size, ff.nlink, ff.ino]
            end
          end
        rescue
          nil # File.stat can fail because file could have been deleted, ignore error
        end
      }

      # Compute total size, a file with n hard links counts for 1/n of its size per link
      size = lst.inject(0){ |tot, a| tot + if a[3] > 0 then a[2]/a[3] else 0 end }

      # Delete old *.Data.[md5] files that are not being referenced anymore
      lst.each { |a|
        if a[3] == 1 && a[0] =~ /\.Data\.[0-9a-f]+(\.gz)?$/ then
          # Data file with no more links pointing to it
          FileUtils.rm_rf(a[0])
          nDeleted += 1
          size -= a[2]
          a[3] = 0 # Mark as deleted
        end
      }

      # Sort all files on atime
      lst.sort!{ |a1, a2| a1[1] <=> a2[1] }

      t0 = Time.new
      # Clean until size < maxSize _AND_ atime more recent than TTL
      lst.each { |a|
        break if size < @maxSize and t0-a[1] < ttl
        next if a[3] < 1 # Already deleted in previous step
        FileUtils.rm_rf(a[0])
        nDeleted += 1
        # Total size -= file size IF last link to data
        if a[3] == 1 || hIno[a[4]] <= 1 then
          size -= a[2]
        end
        hIno[a[4]] -= 1 if hIno[a[4]] > 0
        a[3] = 0 # Mark as deleted by setting nlinks to 0
      }

      # Remove deleted files from array
      lst.reject! { |a| a[3] < 1 }


      # Sort files per directory to enforce maxFiles
      if cleanTree then
        # Split the array in an array per directory
        # and keep the files sorted on atime in each directory
        slst = Hash.new
        lst.length.times {
          a = lst.shift
          dir = File.dirname(a[0])
          if slst[dir] then
            slst[dir] << a
          else
            slst[dir] = [a]
          end
        }
      else
        # If not cleaning whole tree, we have only a single dir
        slst = {"." => lst}
      end

      nDirectories = slst.length

      slst.each { |dir, files|
        # Remove oldest files so that we have less than @maxFiles in it
        if files.length >= @maxFiles then
          # Remove to leave up 90% of #maxFiles so we don't clean up only a handful of files repeatedly
          (files.length - 9*@maxFiles/10).times {
            if a = files.shift then
              FileUtils.rm_rf(a[0])
              nDeleted += 1
              # Total size -= file size IF last link to data
              if a[3] == 1 || hIno[a[4]] <= 1 then
                size -= a[2]
              end
              hIno[a[4]] -= 1 if hIno[a[4]] > 0
            end
          }
        end
      }
    } #end of chdir
    [size, nDeleted, nDirectories]
  end


  # Build the cache file names for a requested object.
  #
  # Returns [directory, base data name (no params), data file name, meta file name],
  # all but the first being full paths.
  # Raises when, in cacheTree mode, obj would resolve outside the cache directory;
  # Cache.hit and Cache.store both turn that into a nil result.
  def Cache.makeNames(obj, params)
    # Build meta filename and data filename from arguments
    #
    # obj is broken into a path and a filename with appended params
    # e.g. /proj/en/index.xml?style=printable becomes /proj/en and index.xml+printable+yes
    # or .#proj#en#index.xml+printable+yes
    # depending on cacheTree param value

    # .Meta and .Data are appended respectively to the meta filename and data filename
    # Base is the filename without appending params, e.g. .#proj#en#index.xml.Data
    if @cacheTree then
      # Use a path and a file
      dir = "#{@cacheDir}#{File.dirname(obj)}"
      base = f = File.basename(obj)

      # obj usually comes from a web request, refuse anything (e.g. "..") that leaves the cache dir
      root = File.expand_path(@cacheDir)
      resolved = File.expand_path(dir)
      unless resolved == root || resolved[0, root.length + 1] == "#{root}/"
        raise "Cache path #{dir} is outside the cache directory"
      end
    else
      # Convert full path into a single filename
      dir = @cacheDir
      base = f = ".#{obj.gsub(/\//,'#')}"
    end

    f = "#{f}+#{params.reject{|k,v| k.nil?}.sort.join('+')}" if params && params.to_a.length > 0
    # Remove funky chars and squeeze duplicates into single chars
    f = f.gsub(/[^\w\#.+_-]/, "~").squeeze("~.#+")

    # Return names for Data and Meta files, and just the filepath (e.g. #proj#en#index.xml)
    [dir, "#{dir}/#{base}.Data", "#{dir}/#{f}.Data#{@zip}", "#{dir}/#{f}.Meta"]
  end
end

end

# diff --git a/lib/gorg/cgi-bin/gorg.cgi b/lib/gorg/cgi-bin/gorg.cgi new file mode 100755 index 0000000..3c75dbc --- /dev/null +++ b/lib/gorg/cgi-bin/gorg.cgi @@ -0,0 +1,45 @@
#! /usr/bin/ruby

### Copyright 2004, Xavier Neys (neysx@gentoo.org)
# #
# # This file is part of gorg.
# #
# # gorg is free software; you can redistribute it and/or modify
# # it under the terms of the GNU General Public License as published by
# # the Free Software Foundation; either version 2 of the License, or
# # (at your option) any later version.
# #
# # gorg is distributed in the hope that it will be useful,
# # but WITHOUT ANY WARRANTY; without even the implied warranty of
# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# # GNU General Public License for more details.
# #
# # You should have received a copy of the GNU General Public License
# # along with gorg; if not, write to the Free Software
### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

require 'cgi'

require 'gorg/cgi'

# Mix Gorg in before branching: do_Filter and do_CGI are both defined in module Gorg.
# NOTE(review): previously done in the else branch only, which left do_Filter
# undefined in filter mode unless a required file already includes Gorg - confirm
include Gorg

if ARGV.length == 1 and ['-F', '--filter'].include?(ARGV[0]) then
  # cgi does not accept any params like gorg,
  # Only test on -F or --filter being there and nothing else
  do_Filter unless STDIN.tty?
else
  # Make CGI's env public to get access to REQUEST_URI and DOCUMENT_ROOT
  class CGI
    public :env_table
  end

  # Config file is named in env var. GORG_CONF, or possibly REDIRECT_GORG_CONF
  ENV["GORG_CONF"] = ENV["GORG_CONF"]||ENV["REDIRECT_GORG_CONF"]

  gorgInit
  STDERR.close

  cgi = CGI.new
  do_CGI(cgi)
end

# diff --git a/lib/gorg/cgi-bin/search.cgi b/lib/gorg/cgi-bin/search.cgi new file mode 100755 index 0000000..396001e --- /dev/null +++ b/lib/gorg/cgi-bin/search.cgi @@ -0,0 +1,50 @@
#! /usr/bin/ruby

### Copyright 2004, Xavier Neys (neysx@gentoo.org)
# #
# # This file is part of gorg.
# #
# # gorg is free software; you can redistribute it and/or modify
# # it under the terms of the GNU General Public License as published by
# # the Free Software Foundation; either version 2 of the License, or
# # (at your option) any later version.
# #
# # gorg is distributed in the hope that it will be useful,
# # but WITHOUT ANY WARRANTY; without even the implied warranty of
# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# # GNU General Public License for more details.
# #
# # You should have received a copy of the GNU General Public License
# # along with gorg; if not, write to the Free Software
### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA


require 'cgi'
require 'gorg/search'

# Make CGI's env public to get access to REQUEST_URI and DOCUMENT_ROOT
class CGI
  public :env_table
end

include Gorg

# Config file is named in env var.
# GORG_CONF, or possibly REDIRECT_GORG_CONF
# ENV["PATH"] is used as a dirty hackish workaround a limitation of
# webrick's cgi handler: environment variables can't be passed to cgi's
# (REDIRECT_)GORG_CONF should be defined when running cgi's under apache
ENV["GORG_CONF"] ||= ENV["REDIRECT_GORG_CONF"] || ENV["PATH"]

gorgInit
cgi = CGI.new

# Params
#
# l = language code (no param will default to en, empty param defaults to any)
# q = query string
# p = page number in search result (0 < p < 1e6)
# s = page size (9 < s < 120)
# b = boolean search (y|Y|1 means yes, anything else no)

# Run the search and write the answer through the CGI object
GDig::GSearch.new.do_CGI(cgi)

# diff --git a/lib/gorg/cgi.rb b/lib/gorg/cgi.rb new file mode 100644 index 0000000..dfe8451 --- /dev/null +++ b/lib/gorg/cgi.rb @@ -0,0 +1,198 @@
### Copyright 2004, Xavier Neys (neysx@gentoo.org)
# #
# # This file is part of gorg.
# #
# # gorg is free software; you can redistribute it and/or modify
# # it under the terms of the GNU General Public License as published by
# # the Free Software Foundation; either version 2 of the License, or
# # (at your option) any later version.
# #
# # gorg is distributed in the hope that it will be useful,
# # but WITHOUT ANY WARRANTY; without even the implied warranty of
# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# # GNU General Public License for more details.
# #
# # You should have received a copy of the GNU General Public License
# # along with gorg; if not, write to the Free Software
### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

# Process CGI request, either from cgi or fcgi

require "gorg/base"

module Gorg
  # Filter mode: read an XML document on STDIN, transform it with xproc and
  # write the result on STDOUT. Errors, empty results and timeouts (tmout
  # seconds for the whole operation) are reported on STDERR, never raised.
  def do_Filter(tmout=30, params=nil)
    # Read STDIN, transform, spit result out
    Timeout.timeout(tmout) {
      # Give it a few seconds to read it all, then timeout
      xml = STDIN.read
      err, body, filelist = xproc(xml, params, false, true)
      if err["xmlErrLevel"] > 0 then
        STDERR.puts("#{err.collect{|e|e.join(':')}.join("\n")}")
      elsif (body||"").length < 1 then
        # Some transforms can yield empty content
        STDERR.puts("Empty body")
      else
        STDOUT.puts(body)
      end
    }
  rescue Timeout::Error, StandardError =>ex
    # Just spew it out
    STDERR.puts(ex)
  end

  # Answer one CGI (or FastCGI) request through cgi.out.
  #
  # OPTIONS gets an Allow header, GET/HEAD are served either as the raw file
  # (passthru) or as the transformed document, taken from the cache when possible.
  # Every exception is turned into an HTTP answer: Gorg::Status errors use their
  # own header/html, anything else becomes a SysError answer and is logged.
  def do_CGI(cgi)
    header = Hash.new
    # REQUEST_URI may be missing from the environment, treat that as an empty string
    if cgi.path_info.nil? || (cgi.env_table["REQUEST_URI"] || "").index("/#{File.basename($0)}/")
      # Sorry, I'm not supposed to be called directly, e.g. /cgi-bin/gorg.cgi/bullshit_from_smartass_skriptbaby
      raise Gorg::Status::Forbidden
    elsif cgi.request_method == "OPTIONS"
      cgi.out('Allow'=>'GET,HEAD'){""}
    elsif cgi.request_method == "HEAD" or cgi.request_method == "GET"
      # lighttp is b0rked despite what they say :(
      # PATH_INFO == "" and PATH_TRANSLATED == nil
      if cgi.path_info.length > 0 then
        # Apache, or any web browser that works
        path_info = cgi.path_info
      else
        # lighttp, use SCRIPT_NAME instead
        path_info = cgi.env_table['SCRIPT_NAME']
      end
      query = Hash.new
      cgi.params.each{ |p, v| query[p] = v.to_s}
      # Get DOCUMENT_ROOT from environment
      $Config["root"] = cgi.env_table['DOCUMENT_ROOT']

      xml_file = cgi.path_translated||(cgi.env_table['DOCUMENT_ROOT']+cgi.env_table['SCRIPT_NAME'])
      if not FileTest.file?(xml_file)
        # Should have been checked by apache, check anyway
        raise Gorg::Status::NotFound
      else
        # Process request
        # Parse If-None-Match and If-Modified-Since request header fields if any
        inm=ims=nil
        begin
          inm = split_header_etags(cgi.env_table['HTTP_IF_NONE_MATCH']) if cgi.env_table['HTTP_IF_NONE_MATCH']
          ims = Time.parse(cgi.env_table['HTTP_IF_MODIFIED_SINCE']) if cgi.env_table['HTTP_IF_MODIFIED_SINCE']
          ims = nil if ims && ims > Time.now # Dates later than current must be ignored
        rescue
          # Just ignore ill-formated data
          nil
        end

        # Does the client accept gzip and do we support it?
        # A quality value of zero ("gzip;q=0", "gzip;q=0.0") means gzip is refused
        gzipOK = $Config["zipLevel"] > 0 &&
                 cgi.accept_encoding =~ /gzip(\s*;\s*q=([0-9\.]+))?/ &&
                 ($2||"1").to_f > 0

        if $Config['passthru'] && query["passthru"] && query["passthru"] != "0" then
          # passthru allowed by config and requested by visitor, return file as text/plain
          debug("Passthru granted for #{path_info}")
          mstat = File.stat(xml_file)
          raise Gorg::Status::NotModified.new(mstat) if notModified?(mstat, inm, ims)
          body = IO.read(xml_file)
          header['type'] = 'text/plain'
          # If client accepts gzip encoding and we support it, return gzipped file
          if gzipOK then
            body = gzip(body, $Config["zipLevel"])
            header['Content-Encoding'] = "gzip"
            header['Vary'] = "Accept-Encoding"
          end
        else
          # Get cookies and add them to the parameters
          if $Config["acceptCookies"] then
            # Add cookies to our params
            query.merge!(cookies_to_params(cgi.cookies))
          end

          if $Config["httphost"] then
            # Add HTTP_HOST to stylesheet params
            query["httphost"] = if $Config["httphost"][0] == '*' then
              cgi.host||""
            elsif $Config["httphost"].include?('*') then
              $Config["httphost"][0]
            elsif $Config["httphost"].include?(cgi.host) then
              $Config["httphost"][0]
            else
              cgi.host||""
            end
          end

          xml_query = query.dup # xml_query==params passed to the XSL, query=>metadata in cache
          if $Config["linkParam"] then
            xml_query[$Config["linkParam"]] = path_info
          end

          bodyZ = nil # Compressed version
          body, mstat, extrameta = Cache.hit(path_info, query, inm, ims)
          if body.nil? then
            # Cache miss, process file and cache result
            err, body, filelist, extrameta = xproc(xml_file, xml_query, true)
            if err["xmlErrLevel"] > 0 then
              raise "#{err.collect{|e|e.join(':')}.join('<br/>')}"
            elsif (body||"").length < 1 then
              # Some transforms can yield empty content (handbook?part=9&chap=99)
              # Consider this a 404
              raise Gorg::Status::NotFound
            else
              # Cache the output if all was OK
              mstat, bodyZ = Cache.store(body, path_info, query, filelist, extrameta)
              debug("Cached #{path_info}, mstat=#{mstat.inspect}")
              # Check inm & ims again as they might match if another web node had
              # previously delivered the same data
              if notModified?(mstat, inm, ims) and extrameta.join !~ /set-cookie/i
                raise Gorg::Status::NotModified.new(mstat)
              end
            end
          else
            # Cache hit: data comes back as it was stored, i.e. zipped when zipLevel > 0
            if $Config["zipLevel"] > 0 then
              bodyZ = body
              body = nil
            end
          end
          # If client accepts gzip encoding and we support it, return gzipped file
          if bodyZ and gzipOK then
            body = bodyZ
            header['Content-Encoding'] = "gzip"
            header['Vary'] = "Accept-Encoding"
          else
            unless body then
              # We need to unzip bodyZ into body, i.e. we cached zipped data but client does not support gzip
              body = gunzip(bodyZ)
            end
          end
          # Add cookies to http header
          cookies = makeCookies(extrameta)
          if cookies then
            header['cookie'] = cookies
          end
          # Add Content-Type to header
          ct = contentType(extrameta)
          if ct then
            # Turn application/xhtml+xml into text/html if browser does not accept it
            if cgi.accept !~ /application\/xhtml\+xml/ and ct =~ /application\/xhtml\+xml(.*)$/ then
              header['type'] = "text/html#{$1}"
            else
              header['type'] = ct
            end
          else
            header['type'] = 'text/plain'
          end
        end
        # Add ETag & Last-Modified http headers
        # NB: it's simply mstat(file.xml) when passthru=1
        if mstat then
          header['ETag'] = makeETag(mstat)
          header['Last-Modified'] = mstat.mtime.httpdate
        end
      end
      cgi.out(header){body}
    else # Not a HEAD or GET
      raise Gorg::Status::NotAllowed
    end
  rescue => ex
    if ex.respond_to?(:errCode) then
      # One of ours (Gorg::Status::HTTPStatus)
      cgi.out(ex.header){ex.html}
    else
      # Some ruby exceptions occurred, make it a 500
      syserr = Gorg::Status::SysError.new
      cgi.out('Status'=>syserr.errSts){syserr.html(ex)}
      error("do_CGI() failed: #{$!}")
    end
  end
end

# diff --git a/lib/gorg/fcgi-bin/gorg.fcgi b/lib/gorg/fcgi-bin/gorg.fcgi new file mode 100755 index 0000000..1f81cf2 --- /dev/null +++ b/lib/gorg/fcgi-bin/gorg.fcgi @@ -0,0 +1,61 @@
#! /usr/bin/ruby

### Copyright 2004, Xavier Neys (neysx@gentoo.org)
# #
# # This file is part of gorg.
# #
# # gorg is free software; you can redistribute it and/or modify
# # it under the terms of the GNU General Public License as published by
# # the Free Software Foundation; either version 2 of the License, or
# # (at your option) any later version.
# #
# # gorg is distributed in the hope that it will be useful,
# # but WITHOUT ANY WARRANTY; without even the implied warranty of
# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# # See the
# # GNU General Public License for more details.
# #
# # You should have received a copy of the GNU General Public License
# # along with gorg; if not, write to the Free Software
### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

require 'cgi'
require 'fcgi'

# Overload read_from_cmdline to avoid crashing when request method
# is neither GET/HEAD/POST. Default behaviour is to read input from
# STDIN. Not really useful when your webserver gets OPTIONS / :-(
class CGI
  module QueryExtension
    # Pretend nothing was given on the command line
    def read_from_cmdline
      ''
    end
  end
end


require 'gorg/cgi'

include Gorg

gorgInit
STDERR.close

# Number of requests after which this process asks to be stopped (0 = never)
maxRequests = $Config["autoKill"]||0

served = 0
startedAt = Time.new

# Process CGI requests sent by the fastCGI engine
FCGI.each_cgi do |cgi|
  served += 1
  do_CGI(cgi)

  # Is it time to leave?
  # Only when the maximum number of requests has been reached _AND_ at least 1 full minute has gone by
  if maxRequests > 0 && served >= maxRequests && Time.new - startedAt > 60 then
    info("Autokill : #{served} requests have been processed in #{Time.new-startedAt} seconds")
    Process.kill("USR1",$$)
  elsif (served % 50).zero?
    # Garbage Collect regularly to help keep memory
    # footprint low enough without costing too much time.
    GC.start
  end
end

# diff --git a/lib/gorg/log.rb b/lib/gorg/log.rb new file mode 100644 index 0000000..4ef05d6 --- /dev/null +++ b/lib/gorg/log.rb @@ -0,0 +1,56 @@
### Copyright 2004, Xavier Neys (neysx@gentoo.org)
# #
# # This file is part of gorg.
# #
# # gorg is free software; you can redistribute it and/or modify
# # it under the terms of the GNU General Public License as published by
# # the Free Software Foundation; either version 2 of the License, or
# # (at your option) any later version.
# #
# # gorg is distributed in the hope that it will be useful,
# # but WITHOUT ANY WARRANTY; without even the implied warranty of
# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# # GNU General Public License for more details.
# #
# # You should have received a copy of the GNU General Public License
# # along with gorg; if not, write to the Free Software
### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

# Write logging info for our little gorg

require 'syslog'
require 'webrick/log'

module Gorg
  # Make log functions available as if we were inside a log instance
  # If no $Log global variable has been initialized, do nothing

  # Forward msg to $Log at "fatal" level
  def fatal(msg)
    $Log.fatal(msg) if $Log
  end

  # Forward msg to $Log at "error" level
  def error(msg)
    $Log.error(msg) if $Log
  end

  # Forward msg to $Log at "warn" level
  def warn(msg)
    $Log.warn(msg) if $Log
  end

  # Forward msg to $Log at "info" level
  def info(msg)
    $Log.info(msg) if $Log
  end

  # Forward msg to $Log at "debug" level
  def debug(msg)
    $Log.debug(msg) if $Log
  end

  module Log

    class MyLog < WEBrick::BasicLog
      # Interface to WEBrick log system
      # Not much to add at this time ;-)
    end

    class MySyslog
      # Interface to syslog

      # appname is the identity passed to Syslog.open
      def initialize(appname)
        # Open syslog if not already done (only one open is allowed)
        @@syslog = Syslog.open(appname) unless defined?(@@syslog)
        # Make sure messages get through (WEBrick has its own filter)
        @@syslog.mask = Syslog::LOG_UPTO(Syslog::LOG_ERR)
      end

      def <<(str)
        # WEBrick's logging requires the << method
        # Just forward string to syslog
        # The first argument of Syslog#err is a printf-style format string:
        # pass the message as data so that a "%" in it cannot break logging
        @@syslog.err("%s", str)
      end
    end
  end
end

# diff --git a/lib/gorg/search.rb b/lib/gorg/search.rb new file mode 100644 index 0000000..c90448a --- /dev/null +++ b/lib/gorg/search.rb @@ -0,0 +1,444 @@
### Copyright 2004, Xavier Neys (neysx@gentoo.org)
# #
# # This file is part of gorg.
+# # +# # gorg is free software; you can redistribute it and/or modify +# # it under the terms of the GNU General Public License as published by +# # the Free Software Foundation; either version 2 of the License, or +# # (at your option) any later version. +# # +# # gorg is distributed in the hope that it will be useful, +# # but WITHOUT ANY WARRANTY; without even the implied warranty of +# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# # GNU General Public License for more details. +# # +# # You should have received a copy of the GNU General Public License +# # along with Foobar; if not, write to the Free Software +### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +require 'dbi' +require 'yaml' +require 'gorg/base' +require 'cgi' + +module GDig + class GFile + + def initialize(root, f, xlang) + @root = root + @fname = f + @xpath2lang = xlang + end + + def txt + unless @txt then + @txt, @lang = txtifyFile + end + @txt + end + + def lang + unless @lang then + @txt, @lang = txtifyFile + end + @lang + end + + private + + def txtifyFile + x=Gorg::XSL.new + x.xsl = <<EOXSL +<?xml version="1.0" encoding="UTF-8"?> + <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"> + <xsl:output encoding="UTF-8" method="text" indent="no"/> + <xsl:template match="/"> +EOXSL + if (@xpath2lang||"").length > 1 then + x.xsl << <<EOXSL + <xsl:if test="#{@xpath2lang}"> + <xsl:value-of select="concat('%%LANG%%', #{@xpath2lang}, '%%
')"/> + </xsl:if> +EOXSL + end + x.xsl << <<EOXSL + <xsl:apply-templates/> + </xsl:template> + <xsl:template match="*"> + <xsl:apply-templates select="@*"/> + <xsl:apply-templates/> + </xsl:template> + <xsl:template match="@*"> + <xsl:value-of select="concat(' ',.,' ')"/> + </xsl:template> + </xsl:stylesheet> +EOXSL + x.xroot = @root + x.xml = @fname + x.process + + if x.xerr and x.xerr["xmlErrLevel"] >= 3 then + raise x.xerr["xmlErrMsg"] + end + + t = x.xres + if t =~ /^%%LANG%%([^%]+)%%/ then + l = $1 + t = $'.strip + else + l = nil + end + t << @fname + [t.squeeze("\n"), l] + end + end + + class DBFile + attr_reader :fid, :webname + def initialize(dbh, webname, localname) + @dbh = dbh + @webname = webname + @localname = localname + @row = @dbh.select_one("SELECT id,path,lang,timestamp,size FROM files where path = ?", webname) + if @row then + @fid = @row['id'] + else + @fid = nil + end + end + + def DBFile.remove(dbh, fid) + if fid then + dbh.do("delete from files where id=#{fid}") + end + end + + def uptodate? + if @fid then + unless @row then + @row = @dbh.select_one("SELECT id,path,lang,timestamp,size FROM files where id=#{@fid}") + end + if (fstat=File.stat(@localname)) and @row then + @row['timestamp']==fstat.mtime.to_s and @row['size']==fstat.size + else + false + end + end + end + + def update(blob, lang) + fstat=File.stat(@localname) + if @fid then + # update + sql = "update files set lang = ?, txt = ?, timestamp = ?, size = ? 
where id=#{@fid}" + @dbh.do(sql, lang, blob, fstat.mtime.to_s, fstat.size) + else + # insert new one + sql = "insert into files (path, lang, txt, timestamp, size) values (?, ?, ?, ?, ?)" + @dbh.do(sql, webname, lang, blob, fstat.mtime.to_s, fstat.size) + if id=@dbh.select_one("select last_insert_id()") then + @fid = id[0] + else + @fid = nil + end + end + end + end + + class GSearch + attr_reader :dbh, :searchTxt, :searchResult + include Gorg + + def initialize + @dbh = DBI.connect($Config['dbConnect'], $Config['dbUser'], $Config['dbPassword']) + @dbh['AutoCommit'] = true + end + + def indexDir + wipe = false + scanDir { |webName, localName| + begin + dbf = GDig::DBFile.new(@dbh, webName, localName) + unless dbf.uptodate? then + gf = GFile.new($Config['root'], webName, $Config['xlang']) + blob = gf.txt + lang = gf.lang + if (lang||"").length < 1 then + # No lang attribute, see if we can use the filename + if $Config['flang'] and $Config['flang'].match(webName) then + lang = $Config['flang'].match(webName)[1] + end + end + dbf.update(blob, lang) + wipe = true + debug "#{Time.new.to_i} #{webName} indexed" + end + rescue Exception => e + error "Failed to index #{webName} : #{e.to_s}" + end + } + wipeSearches if wipe + end + + def cleanup + # Remove files from db either because + # they should now be excluded or because they do not exist anymore + wipe = false + @dbh.select_all('select id, path from files') { |row| + if not fileMatch(row[1]) or not File.file?($Config['root']+row[1]) then + DBFile.remove(@dbh, row[0]) + debug "GDig::GSearch: #{row[1]} removed" + wipe = true + end + } + wipeSearches if wipe + end + + def do_CGI(cgi) + $Config["root"] = cgi.env_table['DOCUMENT_ROOT']||$Config["root"] + query = {} + # Get cookies + if $Config["acceptCookies"] then + # Add cookies to our params + query = cookies_to_params(cgi.cookies) + end + # Add URI params that are not used by search engine (p,q,l,s) + cgi.params.each{ |p, v| query[p] = v.to_s} + + # Choose language + if 
cgi.has_key?("l") then + lang = cgi["l"] + elsif query.has_key?("SL") then + lang = query["SL"] + else + lang = nil + end + + # Perform search + search(cgi["q"], lang) + + if cgi.has_key?("p") and cgi["p"] =~ /^[0-9]{1,5}$/ then + p = cgi["p"].to_i + else + p = 1 + end + + if cgi.has_key?("s") and cgi["s"] =~ /^[0-9]{2,3}$/ then + s = cgi["s"].to_i + elsif query.has_key?("PL") and query["PL"] =~ /^[0-9]{2,3}$/ then + s = query["PL"].to_i + else + s = 20 + end + s = 120 if s > 120 + + xml = xmlResult(p,s) + header = {}; body = "" + if cgi.has_key?("passthru") and $Config["passthru"] then + header = {'type' => 'text/plain'} + body = xml + else + if $Config["linkParam"] then + query[$Config["linkParam"]] = cgi.script_name + end + if $Config["httphost"] then + # Add HTTP_HOST to stylesheet params + query["httphost"] = if $Config["httphost"][0] == '*' then + cgi.host||"" + elsif $Config["httphost"].include?('*') then + $Config["httphost"][0] + elsif $Config["httphost"].include?(cgi.host) then + $Config["httphost"][0] + else + cgi.host + end + end + + err, body, filelist, extra = xproc(xml, query, false) + if err["xmlErrLevel"] > 0 then + raise "#{err.collect{|e|e.join(':')}.join('<br/>')}" + end + cookies = makeCookies(extra) + ct = setContentType(body) + # Turn application/xhtml+xml into text/html if browser does not accept it + if cgi.accept !~ /application\/xhtml\+xml/ and ct =~ /application\/xhtml\+xml(.*)$/ then + header = {'type' => "text/html#{$1}"} + else + header = {'type' => ct} + end + + # Add cookies to http header + if cookies then + header['cookie'] = cookies + end + end + # If client accepts gzip encoding and we support it, return gzipped file + if $Config["zipLevel"] > 0 and ( cgi.accept_encoding =~ /gzip(\s*;\s*q=([0-9\.]+))?/ and ($2||"1") != "0" ) then + body = gzip(body, $Config["zipLevel"]) + header['Content-Encoding'] = "gzip" + header['Vary'] = "Accept-Encoding" + end + cgi.out(header){body} + rescue => ex + syserr = Gorg::Status::SysError.new + 
cgi.out('Status'=>syserr.errSts){syserr.html(ex)} + error("GSearch::do_CGI() failed: #{$!}") + end + + def search(str, lang) + @searchTxt = str + @searchResult = nil + if (lang||"") == "" then + @searchLang = '%' + else + @searchLang = lang + end + if str =~ /(^|\s)(([+<)(>~-][^+<)(>~-]+)|([^+<)(>~-]+\*))(\s|$)/ then + @searchBool = "Y" + boolClause = "in boolean mode" + else + @searchBool = "N" + boolClause = "" + end + if @searchTxt.length > 0 then + @searchResult = loadSearch + unless @searchResult then + @searchResult = [] + # Perform full text search + sql = <<EOSQL +select id, path, lang, match (txt) against ( ? ) as score +from files +where lang like ? and match (txt) against ( ? #{boolClause} ) +order by score desc +EOSQL + @dbh.select_all(sql, @searchTxt, @searchLang, @searchTxt).each { |r| @searchResult << [r[0],r[1],r[2],r[3]] } + saveSearch + end + end + @searchResult + end + + def xmlResult(page=1, pageLength=25) + # <search page="p" pages="n"> + # <for>search string</for> + # <found link="/path/to/file.xml" lang="fr"> + # blah blah <b>word2</b> bleh + # </found> + pageLength = 20 if pageLength < 1 + xml = "<?xml version='1.0' encoding='UTF-8'?>\n\n" + + if @searchResult and @searchResult.length >= 1 then + removeDeadFiles + nPages = @searchResult.length / pageLength #/ + nPages += 1 unless 0 == @searchResult.length.modulo(pageLength) + page = nPages if page > nPages + page = 1 if page < 1 + + xml << "<search page='#{page}' pages='#{nPages}' pageLength='#{pageLength}' lang='#{xmlEscape(@searchLang)}' bool='#{@searchBool}'>\n" + xml << xmlSearchFor + @searchResult[(page-1)*pageLength..page*pageLength-1].each { |r| + xml << " <found link='#{r[1]}' lang='#{r[2]}' score='#{r[3]}'>\n" + xml << xmlBlobSample(r[0]) << "\n" + xml << " </found>\n" + } + else + xml << "<search page='0' pages='0'>\n" + xml << xmlSearchFor + end + xml << "</search>\n" + end + + def scanDir + Dir.chdir($Config['root']) { + `find -L . 
-type f`.split("\n").each{ |localFile| + if File.file?(localFile) then + webFile = localFile[1..-1] + if fileMatch(webFile) then + yield [webFile, File.expand_path(localFile)] + end + end + } + } + end + + private + + def xmlBlobSample(fileID) + blob = "" + r = @dbh.select_one("select txt from files where id = #{fileID}") + if r then + blob = r[0] + # Find first matching word and extract some text around it + stxt = @searchTxt.tr('`.,\'"\-_+~<>/?;:[]{}+|\\)(*&^%\$\#@!', ' ').split(' ') + regs = stxt.collect { |w| Regexp.new(w, true, 'U') } + ix = nil + regs.each { |r| break if ix=blob.index(r) } + if ix then + if ix < 80 then + x = 0 + else + x = blob[0,ix-60].rindex(/[ ,\.]/) + x = 0 unless x + end + y = blob.index(/[,\. ]/, ix+80) + y = -1 unless y + blob = xmlEscape(blob[x..y]) + # Mark up sought words + regs.each { |r| blob.gsub!(r){|t| "<b>#{t}</b>"} } + else + x = blob[120..-1].index(/[ ,\.]/) + blob = xmlEscape(blob[0..x]) + end + end + blob + end + + def xmlEscape(str) + if str + str.gsub('&','&').gsub('>','>').gsub('<','<') + else + "w00t" + end + end + + def loadSearch + if @searchTxt then + r = @dbh.select_one("select result from savedsearches where words = ? and lang = ? and bool = ?", @searchTxt, @searchLang, @searchBool) + if r then + YAML::load(r[0]) + end + end + end + + def saveSearch + if @searchTxt then + @dbh.do("delete from savedsearches where words = ? and lang = ? 
and bool = ?", @searchTxt, @searchLang, @searchBool) + @dbh.do("insert into savedsearches (words, lang, bool, result) values(?, ?, ?, ?)", @searchTxt, @searchLang, @searchBool, @searchResult.to_yaml) + end + end + + def wipeSearches + @dbh.do("delete from savedsearches") + end + + def fileMatch(f) + $Config['in/out'].each { |inout| + return inout[0] if inout[1].match(f) + } + false + end + + def removeDeadFiles + if @searchResult then + @searchResult.reject!{ |r| not File.file?($Config['root']+r[1]) } + end + end + + def xmlSearchFor + " <for>#{xmlEscape(@searchTxt)}</for>\n" if @searchTxt + end + + end + +end diff --git a/lib/gorg/www.rb b/lib/gorg/www.rb new file mode 100644 index 0000000..eb0c8fa --- /dev/null +++ b/lib/gorg/www.rb @@ -0,0 +1,207 @@ +### Copyright 2004, Xavier Neys (neysx@gentoo.org) +# # +# # This file is part of gorg. +# # +# # gorg is free software; you can redistribute it and/or modify +# # it under the terms of the GNU General Public License as published by +# # the Free Software Foundation; either version 2 of the License, or +# # (at your option) any later version. +# # +# # gorg is distributed in the hope that it will be useful, +# # but WITHOUT ANY WARRANTY; without even the implied warranty of +# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# # GNU General Public License for more details. 
+# # +# # You should have received a copy of the GNU General Public License +# # along with gorg; if not, write to the Free Software +### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +# Run the stand-alone webserver and serve gentoo.org + +require 'gorg/base' +require 'webrick' +require 'cgi' + +class GentooServlet < WEBrick::HTTPServlet::FileHandler + include Gorg + + def do_GET(req, res) + hit = "#{$Config["root"]}#{req.path}" + cacheName = req.path + if FileTest.directory?(hit) and FileTest.exist?(hit+"/index.xml") then + # Use $URI/index.xml for directories that have an index.xml file + hit << "/index.xml" + cacheName << "/index.xml" + end + hit.squeeze!('/') + cacheName.squeeze!('/') + if FileTest.directory?(hit) then + super # Use default FileHandler for directories that have no index.xml + else + if hit !~ /\.(xml)|(rdf)|(rss)$/ then + super # Use default FileHandler if not an xml file + else + if not FileTest.exist?(hit) then + super # Use default FileHandler to handle 404 (file does not exist) + else + # Parse If-None-Match and If-Modified-Since request header fields if any + ims=inm=nil + begin + ims = Time.parse(req['if-modified-since']) if req['if-modified-since'] + inm = split_header_etags(req['if-none-match']) if req['if-none-match'] + rescue + # Just ignore ill-formated data + nil + end + begin + res['Charset'] = 'UTF-8' + # Process xml file or return xml file if passthru=1 + if $Config['passthru'] && req.query && req.query["passthru"] && req.query["passthru"] != "0" then + # passthru allowed by config and requested by visitor, return file as text/plain + mstat = File.stat(hit) + raise Gorg::Status::NotModified.new(mstat) if notModified?(mstat, inm, ims) + debug("Passthru granted for #{hit}") + body = IO.read(hit) + # If client accepts gzip encoding and we support it, return gzipped file + if $Config["zipLevel"] > 0 and (req.accept_encoding.include?("gzip") or req.accept_encoding.include?("x-gzip")) then + res.body = 
gzip(body, $Config["zipLevel"]) + res['Content-Encoding'] = "gzip" + res['Vary'] = "Accept-Encoding" + else + res.body = body + end + res['Content-Type'] = 'text/plain' + else + query_params = req.query.dup + # Get cookies and add them to the parameters + if $Config["acceptCookies"] then + # We need CGI:Cookie objects to be compatible with our cgi modules (stupid WEBrick) + ck = req.raw_header.find{|l| l =~ /^cookie: /i} + if ck then + query_params.merge!(cookies_to_params(CGI::Cookie.parse($'.strip))) + debug "query params are " + query_params.inspect + end + end + if $Config["httphost"] then + # Add HTTP_HOST to stylesheet params + query_params["httphost"] = if $Config["httphost"][0] == '*' then + req.host||"" + elsif $Config["httphost"].include?('*') then + $Config["httphost"][0] + elsif $Config["httphost"].include?(req.host) then + $Config["httphost"][0] + else + req.host||"" + end + end + + bodyZ = nil + body, mstat, extrameta = Gorg::Cache.hit(cacheName, query_params, inm, ims) + if body.nil? 
then + xml_query = query_params.dup + if $Config["linkParam"] then + xml_query[$Config["linkParam"]] = req.path + end + # Cache miss, process file and cache result + err, body, filelist, extrameta = xproc(hit, xml_query, true) + warn("#{err.collect{|e|e.join(':')}.join('; ')}") if err["xmlErrLevel"] == 1 + error("#{err.collect{|e|e.join(':')}.join('; ')}") if err["xmlErrLevel"] > 1 + # Display error message if any, just like the cgi/fcgi versions + raise ("#{err.collect{|e|e.join(':')}.join('<br/>')}") if err["xmlErrLevel"] > 0 + # Cache output + mstat, bodyZ = Gorg::Cache.store(body, cacheName, query_params, filelist, extrameta) + else + if $Config["zipLevel"] > 0 then + bodyZ = body + body = nil + end + end + # If client accepts gzip encoding and we support it, return gzipped file + if bodyZ and $Config["zipLevel"] > 0 and (req.accept_encoding.include?("gzip") or req.accept_encoding.include?("x-gzip")) then + res.body = bodyZ + res['Content-Encoding'] = "gzip" + res['Vary'] = "Accept-Encoding" + else + if body then + res.body = body + else + # We need to unzip bodyZ into body, i.e. 
we cached zipped data but client does not support gzip + res.body = gunzip(bodyZ) + end + end + # Add cookies to http header + cookies = makeCookies(extrameta) + if cookies then + cookies.each{|c| res.cookies << c.to_s} + end + # Add Content-Type to header + ct = contentType(extrameta).split(';')[0] + if ct then + # Turn application/xhtml+xml into text/html if browser does not accept it + if req.accept.to_s !~ /application\/xhtml\+xml/ and ct =~ /application\/xhtml\+xml(.*)$/ then + res['Content-Type'] = "text/html#{$1}" + else + res['Content-Type'] = ct + end + else + res['Content-Type'] = 'text/plain' + end + end + if mstat then + res['ETag'] = makeETag(mstat) + res['Last-Modified'] = mstat.mtime.httpdate + end + rescue => ex + if ex.respond_to?(:errCode) then + # One of ours (Gorg::Status::HTTPStatus) + res.body = ex.html + res.status = ex.errCode + ex.header.each {|k,v| res[k]=v unless k =~ /status|cookie/i} + else + # Some ruby exceptions occurred, make it a syserr + syserr = Gorg::Status::SysError.new + res.body = syserr.html(ex) + res.status = syserr.errCode + end + end + end + end + end + end +end + +### +#|# Start Here +### + +def www + # Log accesses to either stderr, syslog or a file + if $Config["accessLog"] == "syslog" + # Use syslog again, use our own format based on default but without timestamp + access_log = [ [ @syslog, "HIT %h \"%r\" %s %b" ] ] + STDERR.close + elsif $Config["accessLog"] == "stderr" + # Use syslog again, use our own format based on default but without timestamp + access_log = [ [ STDERR, "HIT %h \"%r\" %s %b" ] ] + else + # Open file and use it, if it's not writable, tough! 
+ access_log_stream = File.open($Config["accessLog"], "a") + access_log = [ [ access_log_stream, WEBrick::AccessLog::COMBINED_LOG_FORMAT ] ] + STDERR.close + end + + s = WEBrick::HTTPServer.new( :BindAddress => $Config["listen"], :AccessLog=>access_log, :Logger => $Log, :Port => $Config["port"], :CGIPathEnv => ENV["GORG_CONF"]) + + # Mount directories + $Config["mounts"].each { |m| + s.mount(m[0], WEBrick::HTTPServlet::FileHandler, m[1]) + } + s.mount("/", GentooServlet, $Config["root"]) + + # Start server + trap("INT"){ s.shutdown } + + puts "\n\nStarting the Gorg web server on #{$Config['listen']}:#{$Config['port']}\n\nHit Ctrl-C or type \"kill #{$$}\" to stop it\n\n" + + s.start +end diff --git a/setup.rb b/setup.rb new file mode 100644 index 0000000..3bee28e --- /dev/null +++ b/setup.rb @@ -0,0 +1,1360 @@ +# +# setup.rb +# +# Copyright (c) 2000-2004 Minero Aoki +# +# This program is free software. +# You can distribute/modify this program under the terms of +# the GNU LGPL, Lesser General Public License version 2.1. +# + +unless Enumerable.method_defined?(:map) # Ruby 1.4.6 + module Enumerable + alias map collect + end +end + +unless File.respond_to?(:read) # Ruby 1.6 + def File.read(fname) + open(fname) {|f| + return f.read + } + end +end + +def File.binread(fname) + open(fname, 'rb') {|f| + return f.read + } +end + +# for corrupted windows stat(2) +def File.dir?(path) + File.directory?((path[-1,1] == '/') ? path : path + '/') +end + + +class SetupError < StandardError; end + +def setup_rb_error(msg) + raise SetupError, msg +end + +# +# Config +# + +if arg = ARGV.detect {|arg| /\A--rbconfig=/ =~ arg } + ARGV.delete(arg) + require arg.split(/=/, 2)[1] + $".push 'rbconfig.rb' +else + require 'rbconfig' +end + +def multipackage_install? 
+ FileTest.directory?(File.dirname($0) + '/packages') +end + + +class ConfigItem + def initialize(name, template, default, desc) + @name = name.freeze + @template = template + @value = default + @default = default.dup.freeze + @description = desc + end + + attr_reader :name + attr_reader :description + + attr_accessor :default + alias help_default default + + def help_opt + "--#{@name}=#{@template}" + end + + def value + @value + end + + def eval(table) + @value.gsub(%r<\$([^/]+)>) { table[$1] } + end + + def set(val) + @value = check(val) + end + + private + + def check(val) + setup_rb_error "config: --#{name} requires argument" unless val + val + end +end + +class BoolItem < ConfigItem + def config_type + 'bool' + end + + def help_opt + "--#{@name}" + end + + private + + def check(val) + return 'yes' unless val + unless /\A(y(es)?|n(o)?|t(rue)?|f(alse))\z/i =~ val + setup_rb_error "config: --#{@name} accepts only yes/no for argument" + end + (/\Ay(es)?|\At(rue)/i =~ value) ? 'yes' : 'no' + end +end + +class PathItem < ConfigItem + def config_type + 'path' + end + + private + + def check(path) + setup_rb_error "config: --#{@name} requires argument" unless path + path[0,1] == '$' ? 
path : File.expand_path(path) + end +end + +class ProgramItem < ConfigItem + def config_type + 'program' + end +end + +class SelectItem < ConfigItem + def initialize(name, template, default, desc) + super + @ok = template.split('/') + end + + def config_type + 'select' + end + + private + + def check(val) + unless @ok.include?(val.strip) + setup_rb_error "config: use --#{@name}=#{@template} (#{val})" + end + val.strip + end +end + +class PackageSelectionItem < ConfigItem + def initialize(name, template, default, help_default, desc) + super name, template, default, desc + @help_default = help_default + end + + attr_reader :help_default + + def config_type + 'package' + end + + private + + def check(val) + unless File.dir?("packages/#{val}") + setup_rb_error "config: no such package: #{val}" + end + val + end +end + +class ConfigTable_class + + def initialize(items) + @items = items + @table = {} + items.each do |i| + @table[i.name] = i + end + ALIASES.each do |ali, name| + @table[ali] = @table[name] + end + end + + include Enumerable + + def each(&block) + @items.each(&block) + end + + def key?(name) + @table.key?(name) + end + + def lookup(name) + @table[name] or raise ArgumentError, "no such config item: #{name}" + end + + def add(item) + @items.push item + @table[item.name] = item + end + + def remove(name) + item = lookup(name) + @items.delete_if {|i| i.name == name } + @table.delete_if {|name, i| i.name == name } + item + end + + def new + dup() + end + + def savefile + '.config' + end + + def load + begin + t = dup() + File.foreach(savefile()) do |line| + k, v = *line.split(/=/, 2) + t[k] = v.strip + end + t + rescue Errno::ENOENT + setup_rb_error $!.message + "#{File.basename($0)} config first" + end + end + + def save + @items.each {|i| i.value } + File.open(savefile(), 'w') {|f| + @items.each do |i| + f.printf "%s=%s\n", i.name, i.value if i.value + end + } + end + + def [](key) + lookup(key).eval(self) + end + + def []=(key, val) + lookup(key).set val + 
end + +end + +c = ::Config::CONFIG + +rubypath = c['bindir'] + '/' + c['ruby_install_name'] + +major = c['MAJOR'].to_i +minor = c['MINOR'].to_i +teeny = c['TEENY'].to_i +version = "#{major}.#{minor}" + +# ruby ver. >= 1.4.4? +newpath_p = ((major >= 2) or + ((major == 1) and + ((minor >= 5) or + ((minor == 4) and (teeny >= 4))))) + +if c['rubylibdir'] + # V < 1.6.3 + _stdruby = c['rubylibdir'] + _siteruby = c['sitedir'] + _siterubyver = c['sitelibdir'] + _siterubyverarch = c['sitearchdir'] +elsif newpath_p + # 1.4.4 <= V <= 1.6.3 + _stdruby = "$prefix/lib/ruby/#{version}" + _siteruby = c['sitedir'] + _siterubyver = "$siteruby/#{version}" + _siterubyverarch = "$siterubyver/#{c['arch']}" +else + # V < 1.4.4 + _stdruby = "$prefix/lib/ruby/#{version}" + _siteruby = "$prefix/lib/ruby/#{version}/site_ruby" + _siterubyver = _siteruby + _siterubyverarch = "$siterubyver/#{c['arch']}" +end +libdir = '-* dummy libdir *-' +stdruby = '-* dummy rubylibdir *-' +siteruby = '-* dummy site_ruby *-' +siterubyver = '-* dummy site_ruby version *-' +parameterize = lambda {|path| + path.sub(/\A#{Regexp.quote(c['prefix'])}/, '$prefix')\ + .sub(/\A#{Regexp.quote(libdir)}/, '$libdir')\ + .sub(/\A#{Regexp.quote(stdruby)}/, '$stdruby')\ + .sub(/\A#{Regexp.quote(siteruby)}/, '$siteruby')\ + .sub(/\A#{Regexp.quote(siterubyver)}/, '$siterubyver') +} +libdir = parameterize.call(c['libdir']) +stdruby = parameterize.call(_stdruby) +siteruby = parameterize.call(_siteruby) +siterubyver = parameterize.call(_siterubyver) +siterubyverarch = parameterize.call(_siterubyverarch) + +if arg = c['configure_args'].split.detect {|arg| /--with-make-prog=/ =~ arg } + makeprog = arg.sub(/'/, '').split(/=/, 2)[1] +else + makeprog = 'make' +end + +common_conf = [ + PathItem.new('prefix', 'path', c['prefix'], + 'path prefix of target environment'), + PathItem.new('bindir', 'path', parameterize.call(c['bindir']), + 'the directory for commands'), + PathItem.new('libdir', 'path', libdir, + 'the directory for libraries'), 
+ PathItem.new('datadir', 'path', parameterize.call(c['datadir']), + 'the directory for shared data'), + PathItem.new('mandir', 'path', parameterize.call(c['mandir']), + 'the directory for man pages'), + PathItem.new('sysconfdir', 'path', parameterize.call(c['sysconfdir']), + 'the directory for man pages'), + PathItem.new('stdruby', 'path', stdruby, + 'the directory for standard ruby libraries'), + PathItem.new('siteruby', 'path', siteruby, + 'the directory for version-independent aux ruby libraries'), + PathItem.new('siterubyver', 'path', siterubyver, + 'the directory for aux ruby libraries'), + PathItem.new('siterubyverarch', 'path', siterubyverarch, + 'the directory for aux ruby binaries'), + PathItem.new('rbdir', 'path', '$siterubyver', + 'the directory for ruby scripts'), + PathItem.new('sodir', 'path', '$siterubyverarch', + 'the directory for ruby extentions'), + PathItem.new('rubypath', 'path', rubypath, + 'the path to set to #! line'), + ProgramItem.new('rubyprog', 'name', rubypath, + 'the ruby program using for installation'), + ProgramItem.new('makeprog', 'name', makeprog, + 'the make program to compile ruby extentions'), + SelectItem.new('shebang', 'all/ruby/never', 'ruby', + 'shebang line (#!) 
editing mode'), + BoolItem.new('without-ext', 'yes/no', 'no', + 'does not compile/install ruby extentions') +] +class ConfigTable_class # open again + ALIASES = { + 'std-ruby' => 'stdruby', + 'site-ruby-common' => 'siteruby', # For backward compatibility + 'site-ruby' => 'siterubyver', # For backward compatibility + 'bin-dir' => 'bindir', + 'bin-dir' => 'bindir', + 'rb-dir' => 'rbdir', + 'so-dir' => 'sodir', + 'data-dir' => 'datadir', + 'ruby-path' => 'rubypath', + 'ruby-prog' => 'rubyprog', + 'ruby' => 'rubyprog', + 'make-prog' => 'makeprog', + 'make' => 'makeprog' + } +end +multipackage_conf = [ + PackageSelectionItem.new('with', 'name,name...', '', 'ALL', + 'package names that you want to install'), + PackageSelectionItem.new('without', 'name,name...', '', 'NONE', + 'package names that you do not want to install') +] +if multipackage_install? + ConfigTable = ConfigTable_class.new(common_conf + multipackage_conf) +else + ConfigTable = ConfigTable_class.new(common_conf) +end + + +module MetaConfigAPI + + def eval_file_ifexist(fname) + instance_eval File.read(fname), fname, 1 if File.file?(fname) + end + + def config_names + ConfigTable.map {|i| i.name } + end + + def config?(name) + ConfigTable.key?(name) + end + + def bool_config?(name) + ConfigTable.lookup(name).config_type == 'bool' + end + + def path_config?(name) + ConfigTable.lookup(name).config_type == 'path' + end + + def value_config?(name) + case ConfigTable.lookup(name).config_type + when 'bool', 'path' + true + else + false + end + end + + def add_config(item) + ConfigTable.add item + end + + def add_bool_config(name, default, desc) + ConfigTable.add BoolItem.new(name, 'yes/no', default ? 
'yes' : 'no', desc) + end + + def add_path_config(name, default, desc) + ConfigTable.add PathItem.new(name, 'path', default, desc) + end + + def set_config_default(name, default) + ConfigTable.lookup(name).default = default + end + + def remove_config(name) + ConfigTable.remove(name) + end + +end + + +# +# File Operations +# + +module FileOperations + + def mkdir_p(dirname, prefix = nil) + dirname = prefix + File.expand_path(dirname) if prefix + $stderr.puts "mkdir -p #{dirname}" if verbose? + return if no_harm? + + # does not check '/'... it's too abnormal case + dirs = File.expand_path(dirname).split(%r<(?=/)>) + if /\A[a-z]:\z/i =~ dirs[0] + disk = dirs.shift + dirs[0] = disk + dirs[0] + end + dirs.each_index do |idx| + path = dirs[0..idx].join('') + Dir.mkdir path unless File.dir?(path) + end + end + + def rm_f(fname) + $stderr.puts "rm -f #{fname}" if verbose? + return if no_harm? + + if File.exist?(fname) or File.symlink?(fname) + File.chmod 0777, fname + File.unlink fname + end + end + + def rm_rf(dn) + $stderr.puts "rm -rf #{dn}" if verbose? + return if no_harm? + + Dir.chdir dn + Dir.foreach('.') do |fn| + next if fn == '.' + next if fn == '..' + if File.dir?(fn) + verbose_off { + rm_rf fn + } + else + verbose_off { + rm_f fn + } + end + end + Dir.chdir '..' + Dir.rmdir dn + end + + def move_file(src, dest) + File.unlink dest if File.exist?(dest) + begin + File.rename src, dest + rescue + File.open(dest, 'wb') {|f| f.write File.binread(src) } + File.chmod File.stat(src).mode, dest + File.unlink src + end + end + + def install(from, dest, mode, prefix = nil) + $stderr.puts "install #{from} #{dest}" if verbose? + return if no_harm? + + realdest = prefix ? 
prefix + File.expand_path(dest) : dest + realdest = File.join(realdest, File.basename(from)) if File.dir?(realdest) + str = File.binread(from) + if diff?(str, realdest) + verbose_off { + rm_f realdest if File.exist?(realdest) + } + File.open(realdest, 'wb') {|f| + f.write str + } + File.chmod mode, realdest + + File.open("#{objdir_root()}/InstalledFiles", 'a') {|f| + if prefix + f.puts realdest.sub(prefix, '') + else + f.puts realdest + end + } + end + end + + def diff?(new_content, path) + return true unless File.exist?(path) + new_content != File.binread(path) + end + + def command(str) + $stderr.puts str if verbose? + system str or raise RuntimeError, "'system #{str}' failed" + end + + def ruby(str) + command config('rubyprog') + ' ' + str + end + + def make(task = '') + command config('makeprog') + ' ' + task + end + + def extdir?(dir) + File.exist?(dir + '/MANIFEST') + end + + def all_files_in(dirname) + Dir.open(dirname) {|d| + return d.select {|ent| File.file?("#{dirname}/#{ent}") } + } + end + + REJECT_DIRS = %w( + CVS SCCS RCS CVS.adm .svn + ) + + def all_dirs_in(dirname) + Dir.open(dirname) {|d| + return d.select {|n| File.dir?("#{dirname}/#{n}") } - %w(. ..) 
- REJECT_DIRS + } + end + +end + + +# +# Main Installer +# + +module HookUtils + + def run_hook(name) + try_run_hook "#{curr_srcdir()}/#{name}" or + try_run_hook "#{curr_srcdir()}/#{name}.rb" + end + + def try_run_hook(fname) + return false unless File.file?(fname) + begin + instance_eval File.read(fname), fname, 1 + rescue + setup_rb_error "hook #{fname} failed:\n" + $!.message + end + true + end + +end + + +module HookScriptAPI + + def get_config(key) + @config[key] + end + + alias config get_config + + def set_config(key, val) + @config[key] = val + end + + # + # srcdir/objdir (works only in the package directory) + # + + #abstract srcdir_root + #abstract objdir_root + #abstract relpath + + def curr_srcdir + "#{srcdir_root()}/#{relpath()}" + end + + def curr_objdir + "#{objdir_root()}/#{relpath()}" + end + + def srcfile(path) + "#{curr_srcdir()}/#{path}" + end + + def srcexist?(path) + File.exist?(srcfile(path)) + end + + def srcdirectory?(path) + File.dir?(srcfile(path)) + end + + def srcfile?(path) + File.file? srcfile(path) + end + + def srcentries(path = '.') + Dir.open("#{curr_srcdir()}/#{path}") {|d| + return d.to_a - %w(. ..) 
+ } + end + + def srcfiles(path = '.') + srcentries(path).select {|fname| + File.file?(File.join(curr_srcdir(), path, fname)) + } + end + + def srcdirectories(path = '.') + srcentries(path).select {|fname| + File.dir?(File.join(curr_srcdir(), path, fname)) + } + end + +end + + +class ToplevelInstaller + + Version = '3.3.1' + Copyright = 'Copyright (c) 2000-2004 Minero Aoki' + + TASKS = [ + [ 'all', 'do config, setup, then install' ], + [ 'config', 'saves your configurations' ], + [ 'show', 'shows current configuration' ], + [ 'setup', 'compiles ruby extentions and others' ], + [ 'install', 'installs files' ], + [ 'clean', "does `make clean' for each extention" ], + [ 'distclean',"does `make distclean' for each extention" ] + ] + + def ToplevelInstaller.invoke + instance().invoke + end + + @singleton = nil + + def ToplevelInstaller.instance + @singleton ||= new(File.dirname($0)) + @singleton + end + + include MetaConfigAPI + + def initialize(ardir_root) + @config = nil + @options = { 'verbose' => true } + @ardir = File.expand_path(ardir_root) + end + + def inspect + "#<#{self.class} #{__id__()}>" + end + + def invoke + run_metaconfigs + case task = parsearg_global() + when nil, 'all' + @config = load_config('config') + parsearg_config + init_installers + exec_config + exec_setup + exec_install + else + @config = load_config(task) + __send__ "parsearg_#{task}" + init_installers + __send__ "exec_#{task}" + end + end + + def run_metaconfigs + eval_file_ifexist "#{@ardir}/metaconfig" + end + + def load_config(task) + case task + when 'config' + ConfigTable.new + when 'clean', 'distclean' + if File.exist?(ConfigTable.savefile) + then ConfigTable.load + else ConfigTable.new + end + else + ConfigTable.load + end + end + + def init_installers + @installer = Installer.new(@config, @options, @ardir, File.expand_path('.')) + end + + # + # Hook Script API bases + # + + def srcdir_root + @ardir + end + + def objdir_root + '.' + end + + def relpath + '.' 
+ end + + # + # Option Parsing + # + + def parsearg_global + valid_task = /\A(?:#{TASKS.map {|task,desc| task }.join '|'})\z/ + + while arg = ARGV.shift + case arg + when /\A\w+\z/ + setup_rb_error "invalid task: #{arg}" unless valid_task =~ arg + return arg + + when '-q', '--quiet' + @options['verbose'] = false + + when '--verbose' + @options['verbose'] = true + + when '-h', '--help' + print_usage $stdout + exit 0 + + when '-v', '--version' + puts "#{File.basename($0)} version #{Version}" + exit 0 + + when '--copyright' + puts Copyright + exit 0 + + else + setup_rb_error "unknown global option '#{arg}'" + end + end + + nil + end + + + def parsearg_no_options + unless ARGV.empty? + setup_rb_error "#{task}: unknown options: #{ARGV.join ' '}" + end + end + + alias parsearg_show parsearg_no_options + alias parsearg_setup parsearg_no_options + alias parsearg_clean parsearg_no_options + alias parsearg_distclean parsearg_no_options + + def parsearg_config + re = /\A--(#{ConfigTable.map {|i| i.name }.join('|')})(?:=(.*))?\z/ + @options['config-opt'] = [] + + while i = ARGV.shift + if /\A--?\z/ =~ i + @options['config-opt'] = ARGV.dup + break + end + m = re.match(i) or setup_rb_error "config: unknown option #{i}" + name, value = *m.to_a[1,2] + @config[name] = value + end + end + + def parsearg_install + @options['no-harm'] = false + @options['install-prefix'] = '' + while a = ARGV.shift + case a + when /\A--no-harm\z/ + @options['no-harm'] = true + when /\A--prefix=(.*)\z/ + path = $1 + path = File.expand_path(path) unless path[0,1] == '/' + @options['install-prefix'] = path + else + setup_rb_error "install: unknown option #{a}" + end + end + end + + def print_usage(out) + out.puts 'Typical Installation Procedure:' + out.puts " $ ruby #{File.basename $0} config" + out.puts " $ ruby #{File.basename $0} setup" + out.puts " # ruby #{File.basename $0} install (may require root privilege)" + out.puts + out.puts 'Detailed Usage:' + out.puts " ruby #{File.basename $0} <global 
option>" + out.puts " ruby #{File.basename $0} [<global options>] <task> [<task options>]" + + fmt = " %-24s %s\n" + out.puts + out.puts 'Global options:' + out.printf fmt, '-q,--quiet', 'suppress message outputs' + out.printf fmt, ' --verbose', 'output messages verbosely' + out.printf fmt, '-h,--help', 'print this message' + out.printf fmt, '-v,--version', 'print version and quit' + out.printf fmt, ' --copyright', 'print copyright and quit' + out.puts + out.puts 'Tasks:' + TASKS.each do |name, desc| + out.printf fmt, name, desc + end + + fmt = " %-24s %s [%s]\n" + out.puts + out.puts 'Options for CONFIG or ALL:' + ConfigTable.each do |item| + out.printf fmt, item.help_opt, item.description, item.help_default + end + out.printf fmt, '--rbconfig=path', 'rbconfig.rb to load',"running ruby's" + out.puts + out.puts 'Options for INSTALL:' + out.printf fmt, '--no-harm', 'only display what to do if given', 'off' + out.printf fmt, '--prefix=path', 'install path prefix', '$prefix' + out.puts + end + + # + # Task Handlers + # + + def exec_config + @installer.exec_config + @config.save # must be final + end + + def exec_setup + @installer.exec_setup + end + + def exec_install + @installer.exec_install + end + + def exec_show + ConfigTable.each do |i| + printf "%-20s %s\n", i.name, i.value + end + end + + def exec_clean + @installer.exec_clean + end + + def exec_distclean + @installer.exec_distclean + end + +end + + +class ToplevelInstallerMulti < ToplevelInstaller + + include HookUtils + include HookScriptAPI + include FileOperations + + def initialize(ardir) + super + @packages = all_dirs_in("#{@ardir}/packages") + raise 'no package exists' if @packages.empty? 
+ end + + def run_metaconfigs + eval_file_ifexist "#{@ardir}/metaconfig" + @packages.each do |name| + eval_file_ifexist "#{@ardir}/packages/#{name}/metaconfig" + end + end + + def init_installers + @installers = {} + @packages.each do |pack| + @installers[pack] = Installer.new(@config, @options, + "#{@ardir}/packages/#{pack}", + "packages/#{pack}") + end + + with = extract_selection(config('with')) + without = extract_selection(config('without')) + @selected = @installers.keys.select {|name| + (with.empty? or with.include?(name)) \ + and not without.include?(name) + } + end + + def extract_selection(list) + a = list.split(/,/) + a.each do |name| + setup_rb_error "no such package: #{name}" unless @installers.key?(name) + end + a + end + + def print_usage(f) + super + f.puts 'Inluded packages:' + f.puts ' ' + @packages.sort.join(' ') + f.puts + end + + # + # multi-package metaconfig API + # + + attr_reader :packages + + def declare_packages(list) + raise 'package list is empty' if list.empty? 
+ list.each do |name| + raise "directory packages/#{name} does not exist"\ + unless File.dir?("#{@ardir}/packages/#{name}") + end + @packages = list + end + + # + # Task Handlers + # + + def exec_config + run_hook 'pre-config' + each_selected_installers {|inst| inst.exec_config } + run_hook 'post-config' + @config.save # must be final + end + + def exec_setup + run_hook 'pre-setup' + each_selected_installers {|inst| inst.exec_setup } + run_hook 'post-setup' + end + + def exec_install + run_hook 'pre-install' + each_selected_installers {|inst| inst.exec_install } + run_hook 'post-install' + end + + def exec_clean + rm_f ConfigTable.savefile + run_hook 'pre-clean' + each_selected_installers {|inst| inst.exec_clean } + run_hook 'post-clean' + end + + def exec_distclean + rm_f ConfigTable.savefile + run_hook 'pre-distclean' + each_selected_installers {|inst| inst.exec_distclean } + run_hook 'post-distclean' + end + + # + # lib + # + + def each_selected_installers + Dir.mkdir 'packages' unless File.dir?('packages') + @selected.each do |pack| + $stderr.puts "Processing the package `#{pack}' ..." if @options['verbose'] + Dir.mkdir "packages/#{pack}" unless File.dir?("packages/#{pack}") + Dir.chdir "packages/#{pack}" + yield @installers[pack] + Dir.chdir '../..' + end + end + + def verbose? + @options['verbose'] + end + + def no_harm? + @options['no-harm'] + end + +end + + +class Installer + + FILETYPES = %w( bin lib ext data ) + + include HookScriptAPI + include HookUtils + include FileOperations + + def initialize(config, opt, srcroot, objroot) + @config = config + @options = opt + @srcdir = File.expand_path(srcroot) + @objdir = File.expand_path(objroot) + @currdir = '.' + end + + def inspect + "#<#{self.class} #{File.basename(@srcdir)}>" + end + + # + # Hook Script API base methods + # + + def srcdir_root + @srcdir + end + + def objdir_root + @objdir + end + + def relpath + @currdir + end + + # + # configs/options + # + + def no_harm? 
+ @options['no-harm'] + end + + def verbose? + @options['verbose'] + end + + def verbose_off + begin + save, @options['verbose'] = @options['verbose'], false + yield + ensure + @options['verbose'] = save + end + end + + # + # TASK config + # + + def exec_config + exec_task_traverse 'config' + end + + def config_dir_bin(rel) + end + + def config_dir_lib(rel) + end + + def config_dir_ext(rel) + extconf if extdir?(curr_srcdir()) + end + + def extconf + opt = @options['config-opt'].join(' ') + command "#{config('rubyprog')} #{curr_srcdir()}/extconf.rb #{opt}" + end + + def config_dir_data(rel) + end + + # + # TASK setup + # + + def exec_setup + exec_task_traverse 'setup' + end + + def setup_dir_bin(rel) + all_files_in(curr_srcdir()).each do |fname| + adjust_shebang "#{curr_srcdir()}/#{fname}" + end + end + + def adjust_shebang(path) + return if no_harm? + tmpfile = File.basename(path) + '.tmp' + begin + File.open(path, 'rb') {|r| + first = r.gets + return unless File.basename(config('rubypath')) == 'ruby' + return unless File.basename(first.sub(/\A\#!/, '').split[0]) == 'ruby' + $stderr.puts "adjusting shebang: #{File.basename(path)}" if verbose? + File.open(tmpfile, 'wb') {|w| + w.print first.sub(/\A\#!\s*\S+/, '#! 
' + config('rubypath')) + w.write r.read + } + move_file tmpfile, File.basename(path) + } + ensure + File.unlink tmpfile if File.exist?(tmpfile) + end + end + + def setup_dir_lib(rel) + end + + def setup_dir_ext(rel) + make if extdir?(curr_srcdir()) + end + + def setup_dir_data(rel) + end + + # + # TASK install + # + + def exec_install + rm_f 'InstalledFiles' + exec_task_traverse 'install' + end + + def install_dir_bin(rel) + install_files collect_filenames_auto(), "#{config('bindir')}/#{rel}", 0755 + end + + def install_dir_lib(rel) + install_files ruby_scripts(), "#{config('rbdir')}/#{rel}", 0644 + end + + def install_dir_ext(rel) + return unless extdir?(curr_srcdir()) + install_files ruby_extentions('.'), + "#{config('sodir')}/#{File.dirname(rel)}", + 0555 + end + + def install_dir_data(rel) + install_files collect_filenames_auto(), "#{config('datadir')}/#{rel}", 0644 + end + + def install_files(list, dest, mode) + mkdir_p dest, @options['install-prefix'] + list.each do |fname| + install fname, dest, mode, @options['install-prefix'] + end + end + + def ruby_scripts + collect_filenames_auto().select {|n| /(\.rb)|(\.cgi)|(\.fcgi)\z/ =~ n } + end + + # picked up many entries from cvs-1.11.1/src/ignore.c + reject_patterns = %w( + core RCSLOG tags TAGS .make.state + .nse_depinfo #* .#* cvslog.* ,* .del-* *.olb + *~ *.old *.bak *.BAK *.orig *.rej _$* *$ + + *.org *.in .* + ) + mapping = { + '.' 
=> '\.', + '$' => '\$', + '#' => '\#', + '*' => '.*' + } + REJECT_PATTERNS = Regexp.new('\A(?:' + + reject_patterns.map {|pat| + pat.gsub(/[\.\$\#\*]/) {|ch| mapping[ch] } + }.join('|') + + ')\z') + + def collect_filenames_auto + mapdir((existfiles() - hookfiles()).reject {|fname| + REJECT_PATTERNS =~ fname + }) + end + + def existfiles + all_files_in(curr_srcdir()) | all_files_in('.') + end + + def hookfiles + %w( pre-%s post-%s pre-%s.rb post-%s.rb ).map {|fmt| + %w( config setup install clean ).map {|t| sprintf(fmt, t) } + }.flatten + end + + def mapdir(filelist) + filelist.map {|fname| + if File.exist?(fname) # objdir + fname + else # srcdir + File.join(curr_srcdir(), fname) + end + } + end + + def ruby_extentions(dir) + Dir.open(dir) {|d| + ents = d.select {|fname| /\.#{::Config::CONFIG['DLEXT']}\z/ =~ fname } + if ents.empty? + setup_rb_error "no ruby extention exists: 'ruby #{$0} setup' first" + end + return ents + } + end + + # + # TASK clean + # + + def exec_clean + exec_task_traverse 'clean' + rm_f ConfigTable.savefile + rm_f 'InstalledFiles' + end + + def clean_dir_bin(rel) + end + + def clean_dir_lib(rel) + end + + def clean_dir_ext(rel) + return unless extdir?(curr_srcdir()) + make 'clean' if File.file?('Makefile') + end + + def clean_dir_data(rel) + end + + # + # TASK distclean + # + + def exec_distclean + exec_task_traverse 'distclean' + rm_f ConfigTable.savefile + rm_f 'InstalledFiles' + end + + def distclean_dir_bin(rel) + end + + def distclean_dir_lib(rel) + end + + def distclean_dir_ext(rel) + return unless extdir?(curr_srcdir()) + make 'distclean' if File.file?('Makefile') + end + + # + # lib + # + + def exec_task_traverse(task) + run_hook "pre-#{task}" + FILETYPES.each do |type| + if config('without-ext') == 'yes' and type == 'ext' + $stderr.puts 'skipping ext/* by user option' if verbose? 
+ next + end + traverse task, type, "#{task}_dir_#{type}" + end + run_hook "post-#{task}" + end + + def traverse(task, rel, mid) + dive_into(rel) { + run_hook "pre-#{task}" + __send__ mid, rel.sub(%r[\A.*?(?:/|\z)], '') + all_dirs_in(curr_srcdir()).each do |d| + traverse task, "#{rel}/#{d}", mid + end + run_hook "post-#{task}" + } + end + + def dive_into(rel) + return unless File.dir?("#{@srcdir}/#{rel}") + + dir = File.basename(rel) + Dir.mkdir dir unless File.dir?(dir) + prevdir = Dir.pwd + Dir.chdir dir + $stderr.puts '---> ' + rel if verbose? + @currdir = rel + yield + Dir.chdir prevdir + $stderr.puts '<--- ' + rel if verbose? + @currdir = File.dirname(rel) + end + +end + + +if $0 == __FILE__ + begin + if multipackage_install? + ToplevelInstallerMulti.invoke + else + ToplevelInstaller.invoke + end + rescue SetupError + raise if $DEBUG + $stderr.puts $!.message + $stderr.puts "Try 'ruby #{$0} --help' for detailed usage." + exit 1 + end +end |