summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'read-index.pl')
-rw-r--r--read-index.pl43
1 files changed, 43 insertions, 0 deletions
diff --git a/read-index.pl b/read-index.pl
new file mode 100644
index 0000000..c279750
--- /dev/null
+++ b/read-index.pl
@@ -0,0 +1,43 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+
+my $filename = "sample.out";
+
+# Three-argument open: the file name can never be parsed as a mode or pipe.
+open(my $fh, '<', $filename) or die "could not open $filename: $!";
+
+# Collect "File-<id>-<field>: <value>" lines into $documents{<id>}{<field>};
+# <id> is numeric or "dist", <field> is non-empty, <value> may be empty.
+my %documents;
+while (my $line = <$fh>) {
+    # A failed match does not reset $1..$3, so unparsable lines are skipped
+    # instead of being stored under stale or undefined captures. The newline
+    # is not part of the pattern, so an unterminated final line still parses.
+    unless ($line =~ /File-([^-]+)-([^:]+): ([^\n]*)/) {
+        warn "skipping unparsable line $. of $filename\n";
+        next;
+    }
+    my ($fileid, $field, $value) = ($1, $2, $3);
+    # Autovivification creates the per-file hash on first use.
+    $documents{$fileid}{$field} = $value;
+}
+close($fh);
+
+
+# Fields for indexing.
+
+# our %fields = (
+# distfile => 'text',
+# filename => 'text',
+# isdist => 'UnAnalyzedField',
+# size => 'UnAnalyzedField',
+# mtime => 'UnAnalyzedField',
+# md5 => 'UnAnalyzedField',
+# sha1 => 'UnAnalyzedField',
+# );
+
+# The analyzer should simply tokenize filenames into their parts.
+# I would split on [/._-] at least (with "-" last so it is not read as a
+# range). Technically, (\W|_|\d) as the split class might be reasonable.
+