### Copyright 2004, Xavier Neys (neysx@gentoo.org)
# #
# # This file is part of gorg.
# #
# # gorg is free software; you can redistribute it and/or modify
# # it under the terms of the GNU General Public License as published by
# # the Free Software Foundation; either version 2 of the License, or
# # (at your option) any later version.
# #
# # gorg is distributed in the hope that it will be useful,
# # but WITHOUT ANY WARRANTY; without even the implied warranty of
# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# # GNU General Public License for more details.
# #
# # You should have received a copy of the GNU General Public License
# # along with gorg; if not, write to the Free Software
### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
# Cache a bit of data based on
# . a path name, e.g. as received by a webserver
# . a list of parameters, e.g. as received by a webserver
# . a list of files it depends on
require "parsedate"
require "fileutils"
require "find"
require "digest"
require "digest/md5"
module Gorg
CacheStamp = "Gorg-#{Gorg::Version} Cached This Data. Do not alter this file. Thanks."
module Cache
def Cache.init(config)
@@lockfile = ".cache.cleaner.lock"
@cacheDir = nil
if FileTest.directory?(config["cacheDir"])
if FileTest.writable?(config["cacheDir"])
@cacheDir = config["cacheDir"].chomp("/")
else
warn "Cache directory not writable"
end
else
warn "Invalid cache directory"
end
@zipLevel = config["zipLevel"]
@zip = @zipLevel > 0 ? ".gz" : ""
# Time-To-Live in seconds, cached items older than that will be considered too old
@ttl = config["cacheTTL"]
@cacheTree = config["cacheTree"]
@maxFiles = config["maxFiles"] # Max number of files in a single directory
@maxSize = config["cacheSize"]*1024*1024 # Now in bytes
@washNumber = config["cacheWash"] # Clean cache dir after a store operation whenever rand(@washNumber) < 10
@lastCleanup = Time.new-8e8 # Remember last time we started a cleanup so we don't pile them up
end
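# For illustration only, a sketch of the configuration hash Cache.init expects,
# based on the keys read above; the values shown are hypothetical examples:
#
#   Gorg::Cache.init(
#     "cacheDir"  => "/var/cache/gorg", # writable directory holding cached files
#     "zipLevel"  => 6,                 # gzip compression level, 0 disables compression
#     "cacheTTL"  => 86400,             # max age in seconds of a cached item, 0 means no TTL
#     "cacheTree" => true,              # mirror the requested path as subdirectories
#     "maxFiles"  => 1000,              # max number of files in a single directory
#     "cacheSize" => 200,               # max total cache size in MB
#     "cacheWash" => 1000               # clean after a store whenever rand(cacheWash) < 10
#   )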
def Cache.hit(objPath, objParam={}, etags=nil, ifmodsince=nil)
# objPath is typically a requested path passed from a web request but it
# can be just any string. It is not checked against any actual files on the file system
#
# objParam is expected to be a hash or any object whose iterator yields two values
#
# 2 filenames are built with the arguments and should give
# the name of a metafile and a result file
# if the result file is older than @ttl seconds, hit fails
# The metafile is then checked for dependencies
# It contains a list of filenames along with their size and mtime separated by ;;
# etag and ifmodsince are used in a webserver context
# etag is defined if an ETag was part of an If-None-Match request field
# etag can be an array or a single string
# If the current ETag of the meta file matches, no data is returned (webserver should return a 304)
#
# ifmodsince is a time object passed on an If-Modified-Since request field
# If the creation date of the meta file is earlier, no data is returned (webserver should return a 304)
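#
# For illustration, a meta file written by Cache.store looks roughly like this
# (the paths, sizes and dates below are made-up examples):
#
#   Gorg-<version> Cached This Data. Do not alter this file. Thanks.
#   /var/www/doc/en/index.xml;;10452;;Mon Jan 10 12:34:56 UTC 2005;;r
#   /var/www/xsl/guide.xsl;;87012;;Sun Jan 02 08:00:00 UTC 2005;;r
#   ;;extra meta
#   Content-Type:text/html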
return nil if @cacheDir.nil? # Not initialized, ignore request
# Reminder: filenames are full path, no need to prepend dirname
dirname, basename, filename, metaname = makeNames(objPath, objParam)
raise "Cache subdir does not exist" unless FileTest.directory?(dirname)
# Hit the cache
meta, mstat = IO.read(metaname), File.stat(metaname) if metaname && FileTest.file?(metaname) && FileTest.readable?(metaname)
raise "Empty/No meta file" if meta.nil? || meta.length < 1
fstat = File.stat(filename) if filename && FileTest.file?(filename)
raise "Empty/No data file" if fstat.nil?
# Check the timestamps of files in the metadata
meta = meta.split("\n")
raise "I did not write that meta file" unless CacheStamp == meta.shift
mline = meta.shift
while mline and mline !~ /^;;extra meta$/ do
f, s, d = mline.split(";;")
if s.to_i < 0
# File did not exist when cache entry was created
raise "Required file #{f} has (re)appeared" if FileTest.file?(f) && FileTest.readable?(f)
else
# File did exist when cache entry was created, is it still there?
raise "Required file #{f} has disappeared" unless FileTest.file?(f) && FileTest.readable?(f)
fst = File.stat(f)
raise "Size of #{f} has changed from #{fst.size} to #{s.to_i}" unless fst.size == s.to_i
raise "Timestamp of #{f} has changed" unless Time.utc(*ParseDate.parsedate(d)) == fst.mtime.utc
end
mline = meta.shift
end
if mline =~ /^;;extra meta$/ then
extrameta = meta.dup
else
extrameta = []
end
if notModified?(fstat, etags, ifmodsince) and extrameta.join !~ /set-cookie/i
raise Gorg::Status::NotModified.new(fstat)
end
file = IO.read(filename) if filename && FileTest.file?(filename) && FileTest.readable?(filename)
raise "Empty/No data file" if file.nil? || file.length < 1
# Is the data file too old
raise "Data file too old" unless @ttl==0 or (Time.new - fstat.mtime) < @ttl
# Update atime of files, ignore failures as files might have just been removed
begin
t = Time.new
File.utime(t, fstat.mtime, filename)
File.utime(t, mstat.mtime, metaname)
rescue
nil
end
# If we get here, it means the data file can be used, return cache object (data, stat(datafile), extrameta)
# The file is left (un)compressed, it's returned as it was stored
[file, fstat, extrameta]
rescue Gorg::Status::NotModified
# Nothing changed, should return a 304
debug("Client cache is up-to-date")
raise
rescue
# cache hit fails if anything goes wrong, no exception raised
debug("Cache hit on #{objPath} failed: (#{$!})")
nil
end
def Cache.store(data, objPath, objParam={}, deps=[], extrameta=[])
# Store data in cache so it can be retrieved based on the objPath and objParams
# deps should contain a list of files that the object depends on
# as returned by our xsl processor, i.e. an array of [access_type, path] where
# access_type can be "r", "w", or "o" for respectively read, write, other.
# Define content-type
ct = setContentType(data)
extrameta << "Content-Type:#{ct}"
return nil if @cacheDir.nil? # Not initialized, ignore request
# Cache only if no remote objects (ftp:// or http://) in list of used files
if deps && deps.detect{|f| f[0] =~ /^o$/i }
debug "#{objPath} not cached because it needs remote resources"
return nil
end
dirname, basename, filename, metaname = makeNames(objPath, objParam)
FileUtils.mkdir_p(dirname) unless FileTest.directory?(dirname)
# Write Meta file to a temp file (with .timestamp.randomNumber appended)
metaname_t = "#{metaname}.#{Time.new.strftime('%Y%m%d%H%M%S')}.#{rand(9999)}"
# Data might need to be just a link to another .Data file
# if we find another request for the same path with different params
# but an identical MD5 sum
# Which is why we keep a ...xml.Data.[md5 sum] file without the parameters
# in its name that we can hard link to.
# e.g. if a client requests the same full handbook 10 times with toto=1..10 in the URI,
# we'd end up with 10 identical large copies. With links we have only one.
# Old versions are expected to be cleaned up by the washCache() routine
# A Dir.glob() to find the previous ones would be too expensive
# Compute MD5 digest
md5 = Digest::MD5.hexdigest(data)
# Compress data if required
if @zipLevel > 0 then
bodyZ = data = gzip(data, @zipLevel)
else
bodyZ = nil
end
# Set mtime of data file to latest mtime of all required files
# so that caching can work better because mtimes will be
# identical on all webnodes whereas creation date of data
# would be different on all nodes.
maxmtime = Time.now-8e8
fstat = nil
begin
timeout(10){
File.open("#{metaname_t}", "w") {|fmeta|
fmeta.puts(CacheStamp)
# Write filename;;size;;mtime for each file in deps[]
deps.each {|ffe|
ftype = ffe[0]
fdep = ffe[1]
if FileTest.file?(fdep)
s = File.stat(fdep)
fmeta.puts("#{fdep};;#{s.size};;#{s.mtime.utc};;#{ftype}")
maxmtime = s.mtime if s.mtime > maxmtime and ftype =~ /^r$/i
else
# A required file does not exist, use size=-1 and old timestamp
# so that when the file comes back, the cache notices a difference
# and no cache miss gets triggered as long as file does not exist
fmeta.puts("#{fdep};;-1;;Thu Nov 11 11:11:11 UTC 1971")
end
}
fmeta.puts ";;extra meta"
extrameta.each { |m| fmeta.puts m }
}
# Get exclusive access to the cache directory while moving files and/or creating data files
File.open(dirname) { |lockd|
while not lockd.flock(File::LOCK_NB|File::LOCK_EX)
# Timeout does not occur on a blocking lock
# Try a non-blocking one repeatedly for a few seconds until timeout occurs or lock is granted
# We are in a timeout block, remember
sleep 0.1
end
# Remove previous Data
FileUtils.rm_rf(filename)
# mv temp meta file to meta file
FileUtils.mv(metaname_t, metaname)
# We keep a data file for the same requested path but different params
# that end up with the same MD5 sum, i.e. identical results because of unused params
linkname = "#{basename}.#{md5}#{@zip}"
if FileTest.file?(linkname) then
# Data file already there, link to it
File.link(linkname, filename)
else
# Write data file and set its mtime to latest of all files it depends on
File.open("#{filename}", "w") {|fdata| fdata.write(data)}
# Create link
File.link(filename, linkname)
end
# mtime might need to be updated, or needs to be set
# e.g. when a dependency had changed but the result file is identical
# This is needed to keep Last-Modified dates consistent across web nodes
File.utime(Time.now, maxmtime, filename)
fstat = File.stat(filename)
}
}
ensure
FileUtils.rm_rf(metaname_t)
end
# Do we clean the cache?
washCache(dirname, 10) if @washNumber > 0 and rand(@washNumber) < 10
# Return stat(datafile) even if it's just been removed by washCache
# because another web node might still have it or will have it.
# Anyway, the cached item would be regenerated on a later request
# and a 304 would be returned if still appropriate at the time.
# Return fstat of data file (for etag...) and zipped file
[fstat, bodyZ]
rescue Timeout::Error, StandardError =>ex
if ex.is_a?(Timeout::Error) then
warn("Timeout in cache store operation")
else
warn("Cache store error (#{$!})")
end
# Clean up before leaving
FileUtils.rm_rf(filename||"")
FileUtils.rm_rf(metaname||"")
nil # return nil so that caller can act if a failed store really is a problem
end
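# A rough sketch of how a web front-end might combine Cache.hit and Cache.store;
# request, response, transform(), deps, etags and ifmodsince are hypothetical names,
# not part of this module:
#
#   begin
#     cached = Gorg::Cache.hit(request.path, request.params, etags, ifmodsince)
#     if cached
#       body, fstat, extrameta = cached
#     else
#       body, deps, extrameta = transform(request.path, request.params) # regenerate the document
#       fstat, bodyZ = Gorg::Cache.store(body, request.path, request.params, deps, extrameta) || []
#     end
#   rescue Gorg::Status::NotModified
#     response.status = 304 # client copy is up-to-date
#   end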
def Cache.washCache(dirname, tmout=30, cleanTree=false)
# Clean cache entries that are either too old compared to TTL (in seconds)
# or that need to go to bring the total size back under maxSize
# Unused *.Data.[md5] files that are no longer referenced, because their source
# has changed and generated a new *.Data.[md5] file, are removed as well
# cleanTree means to also recurse into subdirectories instead of cleaning dirname only
# tmout is the maximum time (in seconds) spent in here
return nil if @cacheDir.nil? # Not initialized, ignore request
# Also ignore request if dirname not equal to @cacheDir or under it
return nil unless dirname[0, @cacheDir.length] == @cacheDir
# Also ignore request if dirname does not exist yet
return nil unless FileTest.directory?(dirname)
# Also return if less than a minute has elapsed since latest cleanup
t0 = Time.new
return nil if t0 - @lastCleanup < 60
# Remember for next time
@lastCleanup = t0
Dir.chdir(dirname) { |d|
# Recreate lock file if it's been lost
unless File.exist?(@@lockfile)
File.open(@@lockfile, "w") { |lockf| lockf.puts("Lock file created on #{Time.now.utc} by gorg")}
end
# Grab lockfile
File.open(@@lockfile) { |lockf|
if lockf.flock(File::LOCK_NB|File::LOCK_EX) then
infoMsg = "Cleaning up cache in #{dirname} (cleanTree=#{cleanTree}, tmout=#{tmout})"
info(infoMsg)
puts infoMsg if cleanTree
timeout(tmout) {
totalSize, deletedFiles, scannedDirectories = washDir(dirname, cleanTree)
if totalSize >= 0 then
# Size == -1 means dir was locked, throwing an exception would have been nice :)
infoMsg = if cleanTree then
"Cache in #{dirname} is now #{totalSize/1024/1024} MB, #{deletedFiles} files removed in #{(Time.now-t0).to_i} seconds in #{scannedDirectories} directories"
else
"#{deletedFiles} files removed in #{(Time.now-t0).to_i} seconds in #{dirname}"
end
info(infoMsg)
puts infoMsg if cleanTree
end
}
else
# Locked dir, another process is busy cleaning up
debug("#{dirname} locked, skipping")
puts("#{dirname} locked, skipping") if cleanTree
end # of lock test
} # end of File.open(@@lockfile), close & release lock automatically
}
rescue Timeout::Error
info("Timeout while cleaning #{dirname}")
puts("Timeout while cleaning #{dirname}") if cleanTree
rescue StandardError =>ex
error("Error while cleaning cache: #{ex}")
puts("Error while cleaning cache: #{ex}") if cleanTree
end
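# For illustration, a full sweep of the cache tree could be triggered from a
# maintenance task with something like the call below; the directory and the
# 300-second timeout are only example values:
#
#   Gorg::Cache.washCache("/var/cache/gorg", 300, true)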
private
def Cache.washDir(dirname, cleanTree)
# Clean up cache starting from dirname and in subdirectories if cleanTree is true
# Return [newSize in bytes, # deleted files, # scanned directories]
size = nDeleted = nDirectories = 0
Dir.chdir(dirname) { |d|
hIno = Hash.new(0) # hash of file inodes with more than one link
lst = Array.new # array of file names, atime, ...
ttl = @ttl
ttl = 8e8 if ttl == 0 # No ttl, keep very old docs!
# Get list of files sorted on their dirname+atime
Find.find('.') { |f|
begin
unless f =~ /^\.$|#{@@lockfile}/ # ignore "." and lockfile
ff = File.stat(f)
if ff.directory? then
Find.prune unless cleanTree
elsif ff.file? and f =~ /Meta|Data/ then
hIno[ff.ino] = ff.nlink if ff.nlink > 1
# List of files has [name, atime, size, # links, inode]
lst << [f, ff.atime, ff.size, ff.nlink, ff.ino]
end
end
rescue
nil # File.stat can fail because file could have been deleted, ignore error
end
}
# Compute total size
size = lst.inject(0){ |tot, a| tot + if a[3] > 0 then a[2]/a[3] else 0 end }
# Delete old *.Data.[md5] files that are not referenced anymore
lst.each { |a|
if a[3] == 1 && a[0] =~ /\.Data\.[0-9a-f]+(\.gz)?$/ then
# Data file with no more links pointing to it
FileUtils.rm_rf(a[0])
nDeleted += 1
size -= a[2]
a[3] = 0 # Mark as deleted
end
}
# Sort all files on atime
lst.sort!{ |a1, a2| a1[1] <=> a2[1] }
t0 = Time.new
# Clean until size < maxSize _AND_ atime more recent than TTL
lst.each { |a|
break if size < @maxSize and t0-a[1] < ttl
next if a[3] < 1 # Already deleted in previous step
FileUtils.rm_rf(a[0])
nDeleted += 1
# Total size -= file size IF last link to data
if a[3] == 1 || hIno[a[4]] <= 1 then
size -= a[2]
end
hIno[a[4]] -= 1 if hIno[a[4]] > 0
a[3] = 0 # Mark as deleted by setting nlinks to 0
}
# Remove deleted files from array
lst.reject! { |a| a[3] < 1 }
# Sort files per directory to enforce maxFiles
if cleanTree then
# Split the array into one array per directory
# and keep the files sorted on atime in each directory
slst = Hash.new
lst.length.times {
a = lst.shift
d = File.dirname(a[0])
if slst[d] then
slst[d] << a
else
slst[d] = [a]
end
}
else
# If not cleaning whole tree, we have only a single dir
slst = {"." => lst}
end
nDirectories = slst.length
slst.each { |d, lst|
# Remove oldest files so that we have less than @maxFiles in it
if lst.length >= @maxFiles then
# Remove enough files to leave only 90% of @maxFiles so we don't clean up just a handful of files repeatedly
(lst.length - 9*@maxFiles/10).times {
if a = lst.shift then
FileUtils.rm_rf(a[0])
nDeleted += 1
# Total size -= file size IF last link to data
if a[3] == 1 || hIno[a[4]] <= 1 then
size -= a[2]
end
hIno[a[4]] -= 1 if hIno[a[4]] > 0
end
}
end
}
} #end of chdir
[size, nDeleted, nDirectories]
end
def Cache.makeNames(obj, params)
# Build meta filename and data filename from arguments
#
# obj is broken into a path and a filename with appended params
# e.g. /proj/en/index.xml?printable=yes becomes /proj/en and index.xml+printable+yes
# or .#proj#en#index.xml+printable+yes
# depending on cacheTree param value
# .Meta and .Data are appended respectively to the meta filename and data filename
# Base is the filename without the appended params, e.g. .#proj#en#index.xml.Data
if @cacheTree then
# Use a path and a file
dir = "#{@cacheDir}#{File.dirname(obj)}"
base = f = File.basename(obj)
else
# Convert full path into a single filename
dir = @cacheDir
base = f = ".#{obj.gsub(/\//,'#')}"
end
f = "#{f}+#{params.reject{|k,v| k.nil?}.sort.join('+')}" if params && params.to_a.length > 0
# Remove funky chars and squeeze duplicates into single chars
f = f.gsub(/[^\w\#.+_-]/, "~").squeeze("~.#+")
# Return [cache subdir, base Data name without params (used for hard links), Data name, Meta name]
[dir, "#{dir}/#{base}.Data", "#{dir}/#{f}.Data#{@zip}", "#{dir}/#{f}.Meta"]
end
end
end