1 files changed, 43 insertions, 0 deletions
diff --git a/read-index.pl b/read-index.pl
new file mode 100644
index 0000000..c279750
--- /dev/null
+++ b/read-index.pl
@@ -0,0 +1,43 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+
+# Parse "File-<id>-<field>: <value>" records from the dump into %documents.
+my $filename = "sample.out";
+
+# Three-arg open so the name can never be read as a mode; report $! on failure.
+open(my $fh, '<', $filename) or die "could not open $filename: $!";
+
+# $documents{<fileid>}{<field>} = <value>; a repeated field keeps the last value.
+my %documents;
+while (my $line = <$fh>) {
+    chomp $line;    # lets a final line without a trailing newline match too
+
+    # fileid: numeric or "dist"; field: non-empty string; value: may be empty.
+    # Captures are only read on success: a failed match leaves $1..$3 stale.
+    # Lines that are not File records are skipped.
+    my ($fileid, $field, $value) = $line =~ /File-([^-]+)-([^:]+): ([^\n]*)\z/
+        or next;
+
+    # Autovivification creates the per-file hash on first use.
+    $documents{$fileid}{$field} = $value;
+}
+close($fh);
+
+
+# Fields for indexing.
+
+# our %fields = (
+#     distfile => 'text',
+#     filename => 'text',
+#     isdist   => 'UnAnalyzedField',
+#     size     => 'UnAnalyzedField',
+#     mtime    => 'UnAnalyzedField',
+#     md5      => 'UnAnalyzedField',
+#     sha1     => 'UnAnalyzedField',
+# );
+
+# The analyzer should simply tokenize filenames into their parts.
+# I would split on [/._-] at least. Technically, using
+# (\W|_|\d) as the class of split characters might be reasonable.
+