Friday, February 27, 2009

Summarization with Lucene

You may have noticed that over the last couple of months, I haven't been writing too much about text mining. So I got sidetracked a bit - there's life beyond text mining, y'know :-). In any case, I am back on track to working through the remaining chapters of my TMAP book. This week, I describe a summarizer application built using Lucene.

Summarization involves reading a body of text and summarizing it in your own words; if done algorithmically, it requires a fair amount of AI code and domain knowledge (about the text being summarized). Having seen this in action (as a consumer of the end product) at my previous job at CNET, I can tell you that a lot of work goes into something of this sort. My goals are far less ambitious - I just want to find the "best" (most relevant) few sentences from a body of text and call it my summary.

A similar approach is taken by the two open-source summarizer applications I looked at, namely Classifier4J (C4J) and Open Text Summarizer (OTS). Based on reading this Linux.com article describing OTS and using it a bit, and looking at the sources for C4J's SimpleSummarizer, I realized that I could pick up good ideas from both applications and build one myself. For example, OTS uses XML files to specify grammar rules and a dictionary of excluded words, which could be implemented using Lucene's PorterStemFilter and StopFilter respectively. Classifier4J tokenizes words and uses an in-memory HashMap to store a word-frequency map, which Lucene provides through the terms() and docFreq() methods of its IndexReader.

Algorithm and Code

My LuceneSummarizer tokenizes the input into paragraphs, and the paragraphs into sentences, then writes each sentence out to an in-memory Lucene index. It then computes the term frequency map of the index to find the most frequent words found in the document, takes the top few terms and hits the index with a BooleanQuery to find the most relevant sentences. The top few sentences (ordered by docId) thus found constitute the summary. The summarizer code is shown below - referenced classes are described further down.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
// Source: src/main/java/com/mycompany/myapp/summarizers/LuceneSummarizer.java
package com.mycompany.myapp.summarizers;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.commons.collections15.comparators.ReverseComparator;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

import com.mycompany.myapp.clustering.ByValueComparator;
import com.mycompany.myapp.tokenizers.ParagraphTokenizer;
import com.mycompany.myapp.tokenizers.SentenceTokenizer;

/**
 * Lucene based summarizer. Tokenizes a document into paragraphs and
 * paragraphs into sentences, then builds an in-memory Lucene index for
 * the document with sentences as fields in single-field Lucene 
 * documents with index time boosts specified by the paragraph and
 * sentence number. Extracts the top terms from the in-memory index
 * and issues a Boolean OR query to the index with these terms, then
 * returns the top few sentences found ordered by Lucene document id.
 * <p>
 * Typical usage: construct, call the setters, call {@link #init()} once,
 * then call {@link #summarize(String)} for each text.
 */
public class LuceneSummarizer {

  private Analyzer analyzer = new StandardAnalyzer();
  private int numSentences = 2;
  private float topTermCutoff;
  // these two values are used to implement a simple linear deboost. If 
  // a different algorithm is desired, these variables are likely to be
  // no longer required.
  private float sentenceDeboost;
  private float sentenceDeboostBase = 0.5F;
  
  private ParagraphTokenizer paragraphTokenizer;
  private SentenceTokenizer sentenceTokenizer;
  
  /**
   * Allows setting a custom analyzer. Default is StandardAnalyzer
   * if not specified.
   * @param analyzer the analyzer to set.
   */
  public void setAnalyzer(Analyzer analyzer) {
    this.analyzer = analyzer;
  }
  
  /**
   * The number of sentences required in the summary. Default is 2.
   * @param numSentences the number of sentences in summary. Must be
   *        at least 1, since a summary needs at least one sentence.
   * @throws IllegalArgumentException if numSentences is less than 1.
   */
  public void setNumSentences(int numSentences) {
    if (numSentences < 1) {
      throw new IllegalArgumentException(
        "Invalid value: numSentences >= 1");
    }
    this.numSentences = numSentences;
  }
  
  /**
   * This value specifies where to cutoff the term list for query.
   * The text is loaded into an in-memory index, a sentence per
   * Lucene Document. Then the index is queried for terms and their
   * associated frequency in the index. The topTermCutoff is a 
   * ratio from 0 to 1 which specifies how far to go down the 
   * frequency ordered list of terms. The terms considered have 
   * a frequency greater than or equal to topTermCutoff * topFrequency. 
   * @param topTermCutoff a ratio specifying where the term list
   *        will be cut off. Must be between 0 and 1. Default is
   *        to consider all terms if this variable is not set,
   *        ie topTermCutoff == 0. But it is recommended to set
   *        an appropriate value (such as 0.5). 
   * @throws IllegalArgumentException if the value is outside [0, 1].
   */
  public void setTopTermCutoff(float topTermCutoff) {
    if (topTermCutoff < 0.0F || topTermCutoff > 1.0F) {
      throw new IllegalArgumentException(
        "Invalid value: 0.0F <= topTermCutoff <= 1.0F");
    }
    this.topTermCutoff = topTermCutoff;
  }
  
  /**
   * Applies a index-time deboost to the sentences after the first
   * one in all the paragraphs after the first one. This attempts to
   * model the summarization heuristic that a summary can be generated
   * by reading the first paragraph (in full) of a document, followed
   * by the first sentence in every succeeding paragraph. The first 
   * paragraph is not deboosted at all. For the second and succeeding
   * paragraphs, the deboost is calculated as (1 - sentence_pos * deboost)
   * until the value reaches sentenceDeboostBase (default 0.5) or less, 
   * and then no more deboosting occurs. 
   * @param sentenceDeboost the deboost value to set. Must be between 
   *        0 and 1. Default is no deboosting, ie sentenceDeboost == 0.
   * @throws IllegalArgumentException if the value is outside [0, 1].
   */
  public void setSentenceDeboost(float sentenceDeboost) {
    if (sentenceDeboost < 0.0F || sentenceDeboost > 1.0F) {
      throw new IllegalArgumentException(
        "Invalid value: 0.0F <= sentenceDeboost <= 1.0F");
    }
    this.sentenceDeboost = sentenceDeboost;
  }
  
  /**
   * This parameter is used in conjunction with sentenceDeboost. This
   * value defines the base until which deboosting will occur and then
   * stop. Default is set to 0.5 if not set. Must be between 0 and 1.
   * @param sentenceDeboostBase the sentenceDeboostBase to set.
   * @throws IllegalArgumentException if the value is outside [0, 1].
   */
  public void setSentenceDeboostBase(float sentenceDeboostBase) {
    if (sentenceDeboostBase < 0.0F || sentenceDeboostBase > 1.0F) {
      throw new IllegalArgumentException(
        "Invalid value: 0.0F <= sentenceDeboostBase <= 1.0F");
    }
    this.sentenceDeboostBase = sentenceDeboostBase;
  }

  /**
   * The init method pre-instantiates the Paragraph and Sentence tokenizers
   * both of which are based on ICU4J RuleBasedBreakIterators, so they are
   * expensive to set up, therefore we set them up once and reuse them.
   * @throws Exception if one is thrown.
   */
  public void init() throws Exception {
    this.paragraphTokenizer = new ParagraphTokenizer();
    this.sentenceTokenizer = new SentenceTokenizer();
  }
  
  /**
   * This is the method that will be called by a client after setting up
   * the summarizer, configuring it appropriately by calling the setters,
   * and calling init() on it to instantiate its expensive objects.
   * @param text the text to summarize. At this point, the text should
   *        be plain text, converters ahead of this one in the chain
   *        should have done the necessary things to remove HTML tags,
   *        etc.
   * @return the summary in the specified number of sentences, joined
   *         with " ... ". 
   * @throws Exception if one is thrown.
   */
  public String summarize(String text) throws Exception {
    RAMDirectory ramdir = new RAMDirectory();
    try {
      buildIndex(ramdir, text);
      Query topTermQuery = computeTopTermQuery(ramdir);
      String[] sentences = searchIndex(ramdir, topTermQuery);
      return StringUtils.join(sentences, " ... ");
    } finally {
      // the index is only needed for this one call; release it even
      // when indexing or searching fails
      ramdir.close();
    }
  }

  /**
   * Builds an in-memory index of the sentences in the text with the
   * appropriate document boosts if specified. Each sentence becomes one
   * Lucene document with a single stored, analyzed "text" field.
   * @param ramdir the RAM Directory to use.
   * @param text the text to index.
   * @throws IllegalArgumentException if init() has not been called.
   * @throws Exception if one is thrown.
   */
  private void buildIndex(Directory ramdir, String text) throws Exception {
    if (paragraphTokenizer == null || sentenceTokenizer == null) {
      throw new IllegalArgumentException(
        "Please call init() to instantiate tokenizers");
    }
    IndexWriter writer = new IndexWriter(ramdir, analyzer,
      MaxFieldLength.UNLIMITED);
    try {
      paragraphTokenizer.setText(text);
      String paragraph = null;
      int pno = 0;
      while ((paragraph = paragraphTokenizer.nextParagraph()) != null) {
        sentenceTokenizer.setText(paragraph);
        String sentence = null;
        int sno = 0;
        while ((sentence = sentenceTokenizer.nextSentence()) != null) {
          Document doc = new Document();
          doc.add(new Field("text", sentence, Store.YES, Index.ANALYZED));
          doc.setBoost(computeDeboost(pno, sno));
          writer.addDocument(doc);
          sno++;
        }
        pno++;
      }
      writer.commit();
    } finally {
      // always close the writer, also on failure
      writer.close();
    }
  }

  /**
   * Applies a linear deboost function to simulate the manual heuristic of
   * summarizing by skimming the first few sentences off a paragraph.
   * The first paragraph and the first sentence of every paragraph keep
   * a boost of 1.0; all other sentences get
   * max(sentenceDeboostBase, 1 - sentenceNumber * sentenceDeboost).
   * @param paragraphNumber the paragraph number (0-based).
   * @param sentenceNumber the sentence number (0-based).
   * @return the deboost to apply to the current document.
   */
  private float computeDeboost(int paragraphNumber, int sentenceNumber) {
    if (paragraphNumber > 0 && sentenceNumber > 0) {
      float deboost = 1.0F - (sentenceNumber * sentenceDeboost);
      return (deboost < sentenceDeboostBase) ? 
        sentenceDeboostBase : deboost; 
    }
    return 1.0F;
  }

  /**
   * Computes a term frequency map for the index at the specified location.
   * Builds a Boolean OR query out of the "most frequent" terms in the index 
   * and returns it. "Most Frequent" is defined as the terms whose frequencies
   * are greater than or equal to the topTermCutoff * the frequency of the
   * top term, where the topTermCutoff is number between 0 and 1. The
   * number of terms is additionally capped at the BooleanQuery maximum
   * clause count, so that a low cutoff on a large document keeps only the
   * most frequent terms instead of overflowing the query.
   * @param ramdir the directory where the index is created.
   * @return a Boolean OR query; it has no clauses if the index has no terms.
   * @throws Exception if one is thrown.
   */
  private Query computeTopTermQuery(Directory ramdir) throws Exception {
    final Map<String,Integer> frequencyMap = 
      new HashMap<String,Integer>();
    List<String> termlist = new ArrayList<String>();
    IndexReader reader = IndexReader.open(ramdir);
    try {
      TermEnum terms = reader.terms();
      try {
        while (terms.next()) {
          Term term = terms.term();
          String termText = term.text();
          int frequency = reader.docFreq(term);
          frequencyMap.put(termText, frequency);
          termlist.add(termText);
        }
      } finally {
        terms.close();
      }
    } finally {
      reader.close();
    }
    // sort the term map by frequency descending
    Collections.sort(termlist, new ReverseComparator<String>(
      new ByValueComparator<String,Integer>(frequencyMap)));
    // retrieve the top terms based on topTermCutoff; the list is sorted,
    // so the first term carries the top frequency and we can stop at the
    // first term that falls below the cutoff ratio
    List<String> topTerms = new ArrayList<String>();
    if (! termlist.isEmpty()) {
      int maxClauses = BooleanQuery.getMaxClauseCount();
      float topFreq = frequencyMap.get(termlist.get(0)).floatValue();
      for (String term : termlist) {
        if (topTerms.size() >= maxClauses) {
          break;
        }
        float ratio = frequencyMap.get(term).floatValue() / topFreq;
        if (ratio < topTermCutoff) {
          break;
        }
        topTerms.add(term);
      }
    }
    StringBuilder termBuf = new StringBuilder();
    BooleanQuery q = new BooleanQuery();
    for (String topTerm : topTerms) {
      termBuf.append(topTerm).
        append("(").
        append(frequencyMap.get(topTerm)).
        append(");");
      q.add(new TermQuery(new Term("text", topTerm)), Occur.SHOULD);
    }
    // diagnostic output of the selected terms and the resulting query
    System.out.println(">>> top terms: " + termBuf.toString());
    System.out.println(">>> query: " + q.toString());
    return q;
  }
  
  /**
   * Executes the query against the specified index, and returns a bounded
   * collection of sentences ordered by document id (so the sentence ordering
   * is preserved in the collection).
   * @param ramdir the directory location of the index.
   * @param query the Boolean OR query computed from the top terms.
   * @return an array of at most numSentences sentences.
   * @throws Exception if one is thrown.
   */
  private String[] searchIndex(Directory ramdir, Query query) 
      throws Exception {
    // keyed by document id so that iteration follows document order
    SortedMap<Integer,String> sentenceMap = 
      new TreeMap<Integer,String>();
    IndexSearcher searcher = new IndexSearcher(ramdir);
    try {
      TopDocs topDocs = searcher.search(query, numSentences);
      for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
        int docId = scoreDoc.doc;
        Document doc = searcher.doc(docId);
        sentenceMap.put(docId, StringUtils.chomp(doc.get("text")));
      }
    } finally {
      searcher.close();
    }
    return sentenceMap.values().toArray(new String[0]);
  }
}

Custom Analyzer

To provide stemming functionality (so that words such as concurrent and concurrency are treated as occurrences of the same word), I use Lucene's PorterStemFilter and the StopFilter with a custom stopword set from here to build a custom Analyzer for indexing the sentences. I found this article about Lucene Analyzers on Marcus Tripp's blog quite helpful. The code for my analyzer is shown below:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
// Source: src/main/java/com/mycompany/myapp/summarizers/SummaryAnalyzer.java
package com.mycompany.myapp.summarizers;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.PorterStemFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

/**
 * Special purpose analyzer that uses a chain of PorterStemFilter, 
 * StopFilter, LowercaseFilter and StandardFilter to wrap a 
 * StandardTokenizer. The StopFilter uses a custom stop word set
 * adapted from:
 * http://www.onjava.com/onjava/2003/01/15/examples/EnglishStopWords.txt
 * For ease of maintenance, we put these words in a flat file and
 * import them on analyzer construction. In that file, a token starting
 * with '#' begins a comment that runs to the end of the line.
 */
public class SummaryAnalyzer extends Analyzer {

  private Set<String> stopset;
  
  /**
   * Loads the stop words from src/main/resources/stopwords.txt (resolved
   * against the working directory) and builds a case-insensitive stop set.
   * @throws IOException if the stop word file cannot be read.
   */
  public SummaryAnalyzer() throws IOException {
    // split into lines first, so that comments can be removed per line
    // before the remaining text is broken up into individual words
    String[] lines = StringUtils.split(
      FileUtils.readFileToString(new File(
      "src/main/resources/stopwords.txt"), "UTF-8"), "\r\n");
    String[] stopwords = filterComments(lines);
    this.stopset = StopFilter.makeStopSet(stopwords, true);
  }
  
  /**
   * Builds the analysis chain: tokenize, apply StandardFilter, lowercase,
   * remove stop words, then stem.
   * @param fieldName the field being analyzed (not used).
   * @param reader the reader supplying the text to analyze.
   * @return the filtered token stream.
   */
  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    return new PorterStemFilter(
      new StopFilter(
        new LowerCaseFilter(
          new StandardFilter(
            new StandardTokenizer(reader))), stopset));
  }
  
  /**
   * Extracts the stop words from the lines of the stop word file. Each
   * line is split on whitespace; the first token that starts with '#'
   * and everything after it on that line is treated as a comment and
   * dropped, so the words of a comment never end up as stop words.
   * @param input the lines of the stop word file.
   * @return the stop words in file order.
   */
  private String[] filterComments(String[] input) {
    List<String> stopwords = new ArrayList<String>();
    for (String line : input) {
      for (String token : StringUtils.split(line)) {
        if (token.startsWith("#")) {
          // rest of the line is a comment
          break;
        }
        stopwords.add(token);
      }
    }
    return stopwords.toArray(new String[0]);
  }
}

The nice thing about using pre-built Lucene components is that they can be easily extended by adding new filters into the chain. For example, if we wanted to restrict our terms to be of a certain part-of-speech (say nouns), then it is quite simple to build a POSFilter that would use either the Wordnet database or a rule-based tagger such as the Brill tagger.

Heuristics - Modeling Skim

One strategy used by human summarizers is a strategy called skimming that involves reading the first paragraph fully, and then reading the first few sentences from each of the succeeding paragraphs. Our LuceneSummarizer models this using a linear function that is applied to sentences after the first paragraph.

b = 1.0 - (f * i)   if 1.0 - (f * i) >= b0
b = b0              if 1.0 - (f * i) < b0
(equivalently, b = max(b0, 1.0 - (f * i)))
where:
b = computed boost factor
b0 = minimum score after boost (${sentenceDeboostBase})
f = (de-)boost factor (${sentenceDeboost})
i = position of sentence in paragraph (0-based)

This causes successive sentences (from the second paragraph onwards) to be treated as less and less important, until the boost reaches a floor, at which point the remaining sentences in the paragraph all get the same importance. The graph below is the actual boost values from one of my test cases, and should help in visualizing the behavior of the above function.

Sentence Tokenization

To model the heuristic described above, I needed a paragraph tokenizer and a sentence tokenizer. I first tried using my old SentenceTokenizer based on Java's BreakIterator, but it would not recognize a line break as a sentence boundary, so I modified it to use the RuleBasedBreakIterator (RBBI) with default sentence break rules from the ICU4J project. That worked better, except that it would break over abbreviations such as "Mr." terminated by a period, so I had to put in a rule to suppress the break.

I describe the SentenceTokenizer first. As described here, I generate my default rules by dumping the rules from a sentence instance with RBBI.toString(). However, I had basically gotten lucky when customizing the RBBI for word tokenization - this time around, I really needed to understand the default rules, so looking through the commented sentence rules source file and the Unicode spec upon which it is based was practically mandatory for doing any customization. My rule file, including my $AbbrevWord rule, is shown below. Along with my own comments, I pulled in the source file comments as well, so it's fairly easy to understand.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# Source: src/main/resources/sentence_break_rules.txt
# Default sentence break rules augmented with custom rules
# See: http://unicode.org/reports/tr29
# For a description of the variables involved.
# The initial file was generated by instantiating an RBBI.sentenceInstance
# and then RBBI.toString() to dump out the ruleset, then modified to suit
# the application.
# Customizations relative to the dumped defaults: the $AbbrevWord
# definition and the rule that uses it (tagged {110}), and the relaxed
# Rule 6 that allows an optional space before a number.

# Character categories as defined in TR29 (see URL above).
$VoiceMarks   = [\uff9e\uff9f];
$Sep       = [\p{Sentence_Break = Sep}];
$Format    = [\p{Sentence_Break = Format}];
$Sp        = [\p{Sentence_Break = Sp}];
$Lower     = [\p{Sentence_Break = Lower}];
$Upper     = [\p{Sentence_Break = Upper}];
$OLetter   = [\p{Sentence_Break = OLetter}-$VoiceMarks];
$Numeric   = [\p{Sentence_Break = Numeric}];
$ATerm     = [\p{Sentence_Break = ATerm}];
$STerm     = [\p{Sentence_Break = STerm}];
$Close     = [\p{Sentence_Break = Close}];
$CR         = \u000d;
$LF         = \u000a;
$Extend     = [[:Grapheme_Extend = TRUE:]$VoiceMarks];
# Extended forms of the character classes, incorporate
# trailing Extend or Format chars. Rules 4 and 5
$SpEx       = $Sp      ($Extend | $Format)*;
$LowerEx    = $Lower   ($Extend | $Format)*;
$UpperEx    = $Upper   ($Extend | $Format)*;
$OLetterEx  = $OLetter ($Extend | $Format)*;
$NumericEx  = $Numeric ($Extend | $Format)*;
$ATermEx    = $ATerm   ($Extend | $Format)*;
$STermEx    = $STerm   ($Extend | $Format)*;
$CloseEx    = $Close   ($Extend | $Format)*;
# Abbreviations: words such as Mr. or George W. Bush should not break
# a sentence. An abbreviation is defined as an uppercase alpha char 
# followed by {0,3} lowercase alphas followed by a period
# (the "period" is matched as $ATerm, the TR29 ambiguous terminator class).
$AbbrevWord = ($UpperEx | ($UpperEx $LowerEx) | ($UpperEx $LowerEx{2}) \
  | ($UpperEx $LowerEx{3})) $ATerm;

# Rule chaining option; see the ICU RBBI rule syntax documentation for
# its exact effect on how the rules below combine.
!!chain;

# Forward rules. The {nnn} suffix on a rule is its numeric tag.
!!forward;
# Rule 3 - break after separators. Keep CR/LF together.
$CR $LF {100};
# Rule 4 - Break after paragraph separator $Sep
# Rule 5 - Ignore $Format and $Extend
[^$Sep]? ($Extend | $Format)* {101};
# Rule 6 - Don't break after ambiguous terminator if it's immediately
# followed by a number or lowercase letter.
# Replaced this condition to include optional space between the ambiguous
# terminator and number.
#$ATermEx $NumericEx {102};
$ATermEx ($SpEx)? $NumericEx {102};
# Rule 7 - Don't break if ambiguous terminator $ATerm is between two
# uppercase letters
$UpperEx $ATermEx $UpperEx {103};
# Rule 8 - Don't break if ambiguous terminator followed by other 
# continuation punctuation such as comma, colon, semicolon, etc.
$NotLettersEx = [^$OLetter $Upper $Lower $Sep $ATerm $STerm] \
  ($Extend | $Format)* {104};
$ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower {105};
# Rule added for recognizing $AbbrevWord: don't break after an
# abbreviation when a letter or number follows (after optional closing
# punctuation, spaces and non-letters).
$AbbrevWord $CloseEx* $SpEx* $NotLettersEx* \
  ($Lower | $Upper | $NumericEx) {110};
# Rule 8a
($STermEx | $ATermEx) $CloseEx* $SpEx* ($STermEx | $ATermEx) {106};
# Rule 9, 10, 11 - Break after sentence terminator, but include closing 
# punctuation, trailing spaces and paragraph separator (if present).
($STermEx | $ATermEx) $CloseEx* $SpEx* $Sep? {107};
[[^$STerm $ATerm $Close $Sp $Sep $Format $Extend]{bof}] \
  ($Extend | $Format | $Close | $Sp)* . {108};
[[^$STerm $ATerm $Close $Sp $Sep $Format $Extend]{bof}] \
  ($Extend | $Format | $Close | $Sp)* ([$Sep{eof}] | $CR $LF) {109};
# Rule 12 - otherwise... Don't break

# Reverse rules, with reversed (_R) forms of the extended classes.
!!reverse;
$SpEx_R       = ($Extend | $Format)* $Sp;
$ATermEx_R    = ($Extend | $Format)* $ATerm;
$STermEx_R    = ($Extend | $Format)* $STerm;
$CloseEx_R    = ($Extend | $Format)* $Close;
[{bof}] (.? | $LF $CR) [^$Sep]* [$Sep {eof}] ($SpEx_R* $CloseEx_R* \
  ($STermEx_R | $ATermEx_R))*;

# No rules are defined in the safe_forward section.
!!safe_forward;

# No rules are defined in the safe_reverse section.
!!safe_reverse;

The code for the SentenceTokenizer is almost unchanged; the one difference is that we instantiate the BreakIterator differently. Instead of using a static sentence instance, we instantiate the RBBI from the sentence rule file. The choice of RBBI over Java's BreakIterator also resulted in the additional init() method, where I instantiate the Paragraph and Sentence tokenizers, since they need to compile the rules on startup, and are hence expensive resources to instantiate.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
// Source: src/main/java/com/mycompany/myapp/tokenizers/SentenceTokenizer.java
package com.mycompany.myapp.tokenizers;

import java.io.File;
import java.text.BreakIterator;

import org.apache.commons.io.FileUtils;

import com.ibm.icu.text.RuleBasedBreakIterator;

/**
 * Splits text into sentences with an ICU4J RuleBasedBreakIterator that
 * is built from a rule file (adapted from a dump of
 * RBBI.sentenceInstance). Call setText() and then nextSentence()
 * repeatedly until it returns null.
 */
public class SentenceTokenizer {

  private String text;
  private int index = 0;
  private RuleBasedBreakIterator breakIterator;
  
  /**
   * Reads the sentence break rules from
   * src/main/resources/sentence_break_rules.txt and builds the break
   * iterator from them.
   * @throws Exception if one is thrown.
   */
  public SentenceTokenizer() throws Exception {
    File ruleFile = new File("src/main/resources/sentence_break_rules.txt");
    String rules = FileUtils.readFileToString(ruleFile, "UTF-8");
    this.breakIterator = new RuleBasedBreakIterator(rules);
  }
  
  /**
   * Sets the text to tokenize and rewinds to its start.
   * @param text the text to split into sentences.
   */
  public void setText(String text) {
    this.text = text;
    this.breakIterator.setText(text);
    this.index = 0;
  }
  
  /**
   * Returns the text between the previous boundary and the next one.
   * @return the next sentence, or null when there are no more boundaries.
   */
  public String nextSentence() {
    int boundary = breakIterator.next();
    if (boundary != BreakIterator.DONE) {
      String sentence = text.substring(index, boundary);
      index = boundary;
      return sentence;
    }
    return null;
  }
}

Paragraph Tokenization

Neither Java's BreakIterator nor ICU4J's RBBI provides a paragraph iterator, but it is fairly simple (once you understand the rules) to modify the sentence rules to not break on certain separators and thereby create a set of rules for paragraph tokenization. Here is my file containing rules for paragraph tokenization for RBBI.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# Source: src/main/resources/paragraph_break_rules.txt
# Paragraph break rules, derived from the default sentence break rules.
# See: http://unicode.org/reports/tr29
# For a description of the variables involved.
# The initial file was generated by instantiating an RBBI.sentenceInstance
# and then RBBI.toString() to dump out the ruleset, then modified to suit
# the application. Specifically, rules 9-11 have been modified to not match
# on STerm (sentence terminator) and ATerm (ambiguous terminator).

# Character categories as defined in TR29 (see URL above).
$VoiceMarks   = [\uff9e\uff9f];
$Sep       = [\p{Sentence_Break = Sep}];
$Format    = [\p{Sentence_Break = Format}];
$Sp        = [\p{Sentence_Break = Sp}];
$Lower     = [\p{Sentence_Break = Lower}];
$Upper     = [\p{Sentence_Break = Upper}];
$OLetter   = [\p{Sentence_Break = OLetter}-$VoiceMarks];
$Numeric   = [\p{Sentence_Break = Numeric}];
$ATerm     = [\p{Sentence_Break = ATerm}];
$STerm     = [\p{Sentence_Break = STerm}];
$Close     = [\p{Sentence_Break = Close}];
$CR         = \u000d;
$LF         = \u000a;
$Extend     = [[:Grapheme_Extend = TRUE:]$VoiceMarks];
# Extended forms of the character classes, incorporate
# trailing Extend or Format chars. Rules 4 and 5
$SpEx       = $Sp      ($Extend | $Format)*;
$LowerEx    = $Lower   ($Extend | $Format)*;
$UpperEx    = $Upper   ($Extend | $Format)*;
$OLetterEx  = $OLetter ($Extend | $Format)*;
$NumericEx  = $Numeric ($Extend | $Format)*;
$ATermEx    = $ATerm   ($Extend | $Format)*;
$STermEx    = $STerm   ($Extend | $Format)*;
$CloseEx    = $Close   ($Extend | $Format)*;
# Abbreviations: words such as Mr. or George W. Bush should not break
# a sentence. An abbreviation is defined as an uppercase alpha char 
# followed by {0,3} lowercase alphas followed by a period
# (the "period" is matched as $ATerm, the TR29 ambiguous terminator class).
$AbbrevWord = ($UpperEx | ($UpperEx $LowerEx) | \
  ($UpperEx $LowerEx{2}) | ($UpperEx $LowerEx{3})) $ATerm;

# Rule chaining option; see the ICU RBBI rule syntax documentation for
# its exact effect on how the rules below combine.
!!chain;

# Forward rules. The {nnn} suffix on a rule is its numeric tag.
!!forward;
# Rule 3 - break after separators. Keep CR/LF together.
$CR $LF {100};
# Rule 4 - Break after paragraph separator $Sep
# Rule 5 - Ignore $Format and $Extend
[^$Sep]? ($Extend | $Format)* {101};
# Rule 6 - Don't break after ambiguous terminator if it's immediately
# followed by a number or lowercase letter.
# Replaced this condition to include optional space between the ambiguous
# terminator and number.
#$ATermEx $NumericEx {102};
$ATermEx ($SpEx)? $NumericEx {102};
# Rule 7 - Don't break if ambiguous terminator $ATerm is between two
# uppercase letters
$UpperEx $ATermEx $UpperEx {103};
# Rule 8 - Don't break if ambiguous terminator followed by other 
# continuation punctuation such as comma, colon, semicolon, etc.
$NotLettersEx = [^$OLetter $Upper $Lower $Sep $ATerm $STerm] \
  ($Extend | $Format)* {104};
$ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower {105};
# Rule added for recognizing $AbbrevWord: don't break after an
# abbreviation when a letter or number follows (after optional closing
# punctuation, spaces and non-letters).
$AbbrevWord $CloseEx* $SpEx* $NotLettersEx* \
  ($Lower | $Upper | $NumericEx) {110};
# Rule 8a
($STermEx | $ATermEx) $CloseEx* $SpEx* ($STermEx | $ATermEx) {106};
# Rule 9, 10, 11 - Break after sentence terminator, but include closing 
# punctuation, trailing spaces and paragraph separator (if present).
# NOTE: the above sentence rules have been modified for use for paragraph
# tokenization. Specifically, $STerm (sentence terminator) and $ATerm
# (ambiguous terminator) have been removed from the match for figuring
# out paragraph breaks.
$CloseEx* $SpEx* $Sep? {107};
[[^$Close $Sp $Sep $Format $Extend]{bof}] \
  ($Extend | $Format | $Close | $Sp)* . {108};
[[^$Close $Sp $Sep $Format $Extend]{bof}] \
  ($Extend | $Format | $Close | $Sp)* ([$Sep{eof}] | $CR $LF) {109};

# Reverse rules, with reversed (_R) forms of the extended classes.
!!reverse;
$SpEx_R       = ($Extend | $Format)* $Sp;
$CloseEx_R    = ($Extend | $Format)* $Close;
# See modification in !!forward rule to make the sentence rules a bit
# more lenient to allow paragraph tokenization.
[{bof}] (.? | $LF $CR) [^$Sep]* [$Sep {eof}] ($SpEx_R* $CloseEx_R*)*;

# No rules are defined in the safe_forward section.
!!safe_forward;

# No rules are defined in the safe_reverse section.
!!safe_reverse;

And here is the code for ParagraphTokenizer. It is quite similar to the SentenceTokenizer; the only real differences are that it is created using a different rule file and that it exposes a nextParagraph() method instead of a nextSentence() method.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
// Source: src/main/java/com/mycompany/myapp/tokenizers/ParagraphTokenizer.java
package com.mycompany.myapp.tokenizers;

import java.io.File;
import java.text.BreakIterator;

import org.apache.commons.io.FileUtils;

import com.ibm.icu.text.RuleBasedBreakIterator;

/**
 * Splits text into paragraphs with an ICU4J RuleBasedBreakIterator that
 * is built from a rule file (adapted from the rule file used internally
 * by the RBBI sentence tokenizer). Call setText() and then
 * nextParagraph() repeatedly until it returns null.
 */
public class ParagraphTokenizer {

  private String text;
  private int index = 0;
  private RuleBasedBreakIterator breakIterator;
  
  /**
   * Reads the paragraph break rules from
   * src/main/resources/paragraph_break_rules.txt and builds the break
   * iterator from them.
   * @throws Exception if one is thrown.
   */
  public ParagraphTokenizer() throws Exception {
    File ruleFile = new File("src/main/resources/paragraph_break_rules.txt");
    String rules = FileUtils.readFileToString(ruleFile, "UTF-8");
    this.breakIterator = new RuleBasedBreakIterator(rules);
  }
  
  /**
   * Sets the text to tokenize and rewinds to its start.
   * @param text the text to split into paragraphs.
   */
  public void setText(String text) {
    this.text = text;
    this.breakIterator.setText(text);
    this.index = 0;
  }
  
  /**
   * Returns the text between the previous boundary and the next one.
   * @return the next paragraph, or null when there are no more boundaries.
   */
  public String nextParagraph() {
    int boundary = breakIterator.next();
    if (boundary != BreakIterator.DONE) {
      String paragraph = text.substring(index, boundary);
      index = boundary;
      return paragraph;
    }
    return null;
  }
}

Some sample results

My test case has code to run the C4J summarizer as well as my LuceneSummarizer. I used OTS as well on the command line and pulled out the first 2 sentences of the summary. Here is the code for the JUnit test. It tries to summarize, using C4J and then this LuceneSummarizer, three files from different Internet sources - I have manually cut and pasted the text content of these pages into local files, so I don't have to worry about HTML parsing for now.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
// Source: src/test/java/com/mycompany/myapp/summarizers/SummarizerTest.java
package com.mycompany.myapp.summarizers;

import java.io.File;

import net.sf.classifier4J.summariser.ISummariser;
import net.sf.classifier4J.summariser.SimpleSummariser;

import org.apache.commons.io.FileUtils;
import org.junit.Test;

public class SummarizerTest {

  private static String[] TEST_FILES = {
    "src/test/resources/data/sujitpal-actors.txt",
    "src/test/resources/data/nytimes-obama.txt",
    "src/test/resources/data/resample-nb.txt"
  };

  /** Reads a whole fixture file into memory as UTF-8 text. */
  private static String slurp(String path) throws Exception {
    return FileUtils.readFileToString(new File(path), "UTF-8");
  }

  /**
   * Prints a two-sentence Classifier4J summary for every fixture file.
   */
  @Test
  public void testC4JSummarizer() throws Exception {
    for (int i = 0; i < TEST_FILES.length; i++) {
      String path = TEST_FILES[i];
      String content = slurp(path);
      ISummariser c4j = new SimpleSummariser();
      System.out.println("Input: " + path);
      // runs of newlines in the C4J output are shown as ellipses
      String digest = c4j.summarise(content, 2).replaceAll("\n+", "...");
      System.out.println(">>> Summary (from C4J): " + digest);
    }
  }

  /**
   * Prints a two-sentence LuceneSummarizer summary for every fixture file,
   * configuring a fresh summarizer for each one.
   */
  @Test
  public void testLuceneSummarizer() throws Exception {
    for (int i = 0; i < TEST_FILES.length; i++) {
      String path = TEST_FILES[i];
      String content = slurp(path);
      LuceneSummarizer lucene = new LuceneSummarizer();
      lucene.setAnalyzer(new SummaryAnalyzer());
      lucene.setNumSentences(2);
      lucene.setTopTermCutoff(0.5F);
      lucene.setSentenceDeboost(0.2F);
      lucene.init();
      System.out.println("Input: " + path);
      String digest = lucene.summarize(content);
      System.out.println(">>> Summary (from LuceneSummarizer): " + digest);
    }
  }
}

For testing, I used the text (content cut-n-pasted from the web page to local disk) of three web pages. Results are shown below.

Original OTS C4J LuceneSummarizer
One of my own posts Over the past few weeks, I've been looking at various (Java and Scala based) Actor frameworks. Taking the println() calls out from both the Scala and the Jetlang examples improved the performance of both significantly, and the Jetlang example ended up with lower elapsed time numbers than the Scala examples. ... Over the past few weeks, I've been looking at various (Java and Scala based) Actor frameworks. ...Mike was kind enough to take a look at the Jetlang code, and he suggested that the excessive amounts of console IO that the actors were making were causing it to perform worse than Scala. Mike was kind enough to take a look at the Jetlang code, and he suggested that the excessive amounts of console IO that the actors were making were causing it to perform worse than Scala. ... This week, I provide the updated code for Kilim and Jetlang, and code to work with Actor's Guild and ActorFoundry, and provide the elapsed time comparison between these frameworks (as well as the Scala examples from last week).
News Page from New York Times WASHINGTON — Despite the huge sums the federal government is spending to aid banks and stimulate the economy, President Obama said on Monday that his administration will slash the federal budget deficit, in part by ending the “casual dishonesty” that has accompanied Washington budgets of late. One such bad habit has been “the casual dishonesty of hiding irresponsible spending,” Mr. Obama said, citing the Bush administration’s technique of “budgeting zero dollars for the Iraq war — zero — for future years, even when we knew the war would continue.” ... WASHINGTON — Despite the huge sums the federal government is spending to aid banks and stimulate the economy, President Obama said on Monday that his administration will slash the federal budget deficit, in part by ending the “casual dishonesty” that has accompanied Washington budgets of late. ...The president said that the bank-rescue plan and the broader economic stimulus program are necessary not merely to jolt the economy but because the country has “long-term challenges — health care, energy, education and others — that we can no longer afford to ignore.”...“But I want to be very clear,” he said at a White House economic conference involving legislators, business and labor leaders and others. WASHINGTON — Despite the huge sums the federal government is spending to aid banks and stimulate the economy, President Obama said on Monday that his administration will slash the federal budget deficit, in part by ending the “casual dishonesty” that has accompanied Washington budgets of late. ... The deficit is the year-by-year gap between what the federal government spends and the revenue it takes in.
Page Explaining Naive Bayes For classification, we want to determine P (H|X) -- the probability that the hypothesis H holds, given the observed data record X. For example, the probability that a fruit is an apple, given the condition that it is red and round. Suppose your data consist of fruits, described by their color and shape. However, bias in estimating probabilities often may not make a difference in practice -- it is the order of the probabilities, not their exact values, that determine the classifications. P (H|X) is the posterior probability of H conditioned on X. For example, the probability that a fruit is an apple, given the condition that it is red and round. ... P(X) is the prior probability of X, i.e., it is the probability that a data record from our set of fruits is red and round.

Summarizing HTML pages

So far, I have been deliberately dealing with plain text pages in order to keep things simple. However, the use case that prompted this work has to do with generating summaries for HTML pages. In the past, I have built an HTML parser using Jericho as part of another simple-minded summarizer that pulled web-page metadata to generate summaries, which I plan to extend and reuse.

Since I am now going to parse the body, I would like to skip over the H[1-6], SCRIPT, STYLE and TABLE tags at the minimum in order to keep the summaries clean. Jericho allows you to build a TextExtractor object from your Source, which can be configured with various tags to exclude from the body, something like this:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
    // Build a text extractor over the parsed page that leaves out the
    // elements listed in excludeElement() below.
    TextExtractor extractor = new TextExtractor(source) {
      // Returns true for HEAD, H1-H6, SCRIPT, STYLE and TABLE start tags.
      // NOTE(review): names are compared by reference (==), not equals();
      // this presumably relies on Jericho returning the HTMLElementName
      // constants themselves for standard tags - confirm against its docs.
      public boolean excludeElement(StartTag startTag) {
        return 
          startTag.getName() == HTMLElementName.HEAD || 
          startTag.getName() == HTMLElementName.H1 || 
          startTag.getName() == HTMLElementName.H2 ||
          startTag.getName() == HTMLElementName.H3 ||
          startTag.getName() == HTMLElementName.H4 ||
          startTag.getName() == HTMLElementName.H5 ||
          startTag.getName() == HTMLElementName.H6 ||
          startTag.getName() == HTMLElementName.SCRIPT ||
          startTag.getName() == HTMLElementName.STYLE ||
          startTag.getName() == HTMLElementName.TABLE;
      }
    };
    // Extractor options, then render the remaining content as plain text.
    extractor.setConvertNonBreakingSpaces(false);
    extractor.setExcludeNonHTMLElements(true);
    extractor.setIncludeAttributes(false);
    String text = extractor.toString();
    ...

Conclusion

My Lucene Summarizer seems to work decently, but I personally like the results from OTS and C4J better. However, this may be somewhat subjective, and could be the result of my own bias. I think the theory behind the application is sound, but perhaps I need to either play with the tuning parameters a bit more, or add extra heuristics to make the results better. If you have done similar stuff before, I would love to hear suggestions about how this can be improved.

Update 2009-04-26: In recent posts, I have been building on code written and described in previous posts, so there were (and rightly so) quite a few requests for the code. So I've created a project on Sourceforge to host the code. You will find the complete source code built so far in the project's SVN repository.

Saturday, February 21, 2009

Python SQL Runner

Recently, I have been working quite a bit with our taxonomy database, using the data in it as input to some data mining programs I am working on. Initially, I built the database call into the programs themselves, but this approach turned out to be inconvenient for a number of reasons. One of these reasons is that the program is harder to restart in case of failures. Having these programs work with input files instead of a database query also results in a cleaner and more extensible design, since now they can work with inputs other than SQL.

The databases I work with are MySQL and Oracle. The last time I needed this sort of stuff, I would whip up a Python script on-demand using either the MySQLdb or cx-oracle packages. One crashed hard disk and a couple of OS re-installs later, these scripts no longer work because the packages need to be re-installed as well. I have mostly gotten by since the crash using the MyEclipse Database Explorer and cut-n-paste, since the datasets I was working with were smaller, but the size of the data (~ 100K plus rows) I am dealing with now are likely to cause Eclipse to clutch its throat and do a crash-n-burn, taking my other unsaved code along with it, so I needed something that I could use from the command line.

This time around, I planned to do things a bit smarter than I did in the past, so I thought of building a generic script that would dump out rows given any SQL, something similar to my fledgling attempt many, many years ago. Funny how things come full circle from time to time. This time around, I also wanted to make sure its usage survives disk crashes and OS re-installs, so rather than have a script that requires extra dependencies, I wanted to use the JDBC drivers that I was going to install anyway. So my choices boiled down to either Jython or Scala.

I ended up writing it using Jython because my other scripts are all written in Python, and because writing database code in Python is slightly less verbose than in Java (or Scala, since that would have been the same thing). The name of the script is db2csv.py, and the code is shown below:

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/bin/env /opt/jython-2.2.1/jython
import sys
import getopt
import os.path
import traceback
from com.ziclix.python.sql import zxJDBC

class SqlRunner:
  """
  Class to run the SQL and print it out into the output file as a CSV
  """
  def __init__(self, dbconfig, sqlfile, outputfile, sepchar):
    """
    @param dbconfig the database configuration file name
    @param sqlfile the name of the file containing the SQL to be executed
    @param outputfile the name of the file where output will be written
    @param sepchar the separator character (or string) to use
    """
    self.dbconfig = dbconfig
    self.sqlfile = sqlfile
    self.outputfile = outputfile
    self.sepchar = sepchar

  def getDbProps(self):
    """
    Return the database properties as a map of name value pairs. Blank
    lines and lines starting with '#' or '!' are ignored. Only the first
    '=' on a line separates name from value, so values (JDBC URLs, for
    example) may themselves contain '='. Names and values are stripped of
    surrounding whitespace.
    @return a map of name value pairs
    @raise Exception if the configuration file does not exist
    @raise ValueError if a non-comment line contains no '='
    """
    if (not os.path.exists(self.dbconfig)):
      raise Exception("File not found: %s" % (self.dbconfig))
    props = {}
    pfile = open(self.dbconfig, 'r')
    try:
      for pline in pfile:
        # strip() rather than [:-1]: the last line may have no newline
        pline = pline.strip()
        if (len(pline) == 0 or pline[0] in ("#", "!")):
          continue
        if (pline.find("=") < 0):
          raise ValueError("Malformed line in %s: %s" % (self.dbconfig, pline))
        (name, value) = pline.split("=", 1)
        props[name.strip()] = value.strip()
    finally:
      pfile.close()
    return props

  def getSql(self):
    """
    Validate the sql file name, and parse out the SQL to be run. The
    method will skip blank lines and SQL single-line comments (including
    indented ones). Blocks enclosed by free standing multi-line comments
    are also skipped. Comments trailing a statement on the same line are
    NOT removed.
    @return the SQL as a single-line string
    @raise Exception if the SQL file does not exist
    """
    if (not os.path.exists(self.sqlfile)):
      raise Exception("File not found: %s" % (self.sqlfile))
    sql = []
    sfile = open(self.sqlfile, 'r')
    try:
      incomment = False
      for sline in sfile:
        sline = sline.rstrip("\r\n")
        stripped = sline.strip()
        # Lines are joined into one line below, so a surviving "--" line
        # would comment out everything that follows it.
        if (len(stripped) == 0 or stripped.startswith("--")):
          continue
        if (stripped == "/*"):
          # start of SQL comment block
          incomment = True
          continue
        if (stripped == "*/"):
          # end of SQL comment block
          incomment = False
          continue
        if (not incomment):
          sql.append(sline)
    finally:
      sfile.close()
    return " ".join(sql)

  def runSql(self):
    """
    Runs the SQL and prints it out into the specified output file as a CSV
    file delimited by sepchar. The first output line holds the column
    names. The connection, cursor and output file are closed even when the
    query fails, and the output file is only created once the query has
    executed successfully.
    """
    props = self.getDbProps()
    sql = self.getSql()
    print("Running SQL: %s" % (sql))
    db = zxJDBC.connect(props["url"], props["user"], props["password"],
      props["driver"])
    try:
      cur = db.cursor(True)
      try:
        cur.execute(sql)
        meta = cur.description
        print("Writing output to: %s" % (self.outputfile))
        ofile = open(self.outputfile, 'wb')
        try:
          # print the header
          ofile.write(self.sepchar.join(map(lambda x: x[0], meta)) + "\n")
          # fetch row by row instead of fetchall() so that a large result
          # set is not materialized in memory all at once
          row = cur.fetchone()
          while (row is not None):
            ofile.write(self.sepchar.join(map(str, row)) + "\n")
            row = cur.fetchone()
        finally:
          ofile.close()
      finally:
        cur.close()
    finally:
      db.close()
    
def usage(error=""):
  """
  Print the usage text and terminate the process with exit status 2. When
  a non-empty error is passed in, the error and the current stack trace
  are printed before the usage text.
  """
  if (str(error) != ""):
    print("ERROR: %s" % (error))
    print("STACK TRACE:")
    traceback.print_exc()
  helpLines = [
    "USAGE:",
    "%s -d dbconfig -q queryfile -s sepchar -o outputfile" % (sys.argv[0]),
    "OR: %s -h" % (sys.argv[0]),
    "OPTIONS:",
    "--dbconfig | -d  : database configuration file",
    "  configuration file must be in properties format, with the following",
    "  keys defined: driver, url, user and password",
    "--queryfile | -q : name of file containing SQL to be run",
    "--outputfile | -o: name of file where results should be written",
    "--sep | -s       : the separator character to use in output",
    "--help | -h      : print this information",
  ]
  for helpLine in helpLines:
    print(helpLine)
  sys.exit(2)

def extractOptions(argv):
  """
  Extract command line options and return a tuple. All four of
  -d/--dbconfig, -q/--queryfile, -o/--outputfile and -s/--sep must be
  supplied exactly once; otherwise (or when -h/--help is given, or the
  options cannot be parsed) the usage text is printed and the process
  exits via usage().
  @param argv the sys.argv object
  @return a tuple (dbconfig, sqlfile, outputfile, sepchar)
  """
  try:
    (opts, args) = getopt.getopt(argv[1:], "d:q:s:o:h",
      ["dbconfig=", "queryfile=", "sep=", "outputfile=", "help"])
  except getopt.GetoptError:
    # tell the user which option was rejected before showing the usage
    print("ERROR: %s" % (sys.exc_info()[1]))
    usage()
  for (key, value) in opts:
    if (key in ("-h", "--help")):
      usage()
  if (len(opts) != 4):
    usage()
  targets = {
    "-d": "dbconfig", "--dbconfig": "dbconfig",
    "-q": "sqlfile", "--queryfile": "sqlfile",
    "-o": "outputfile", "--outputfile": "outputfile",
    "-s": "sepchar", "--sep": "sepchar",
  }
  values = {}
  for (key, value) in opts:
    values[targets[key]] = value
  # Four options were given, but one may have been repeated, leaving
  # another one unset; report that instead of failing on an unbound name.
  for required in ("dbconfig", "sqlfile", "outputfile", "sepchar"):
    if (required not in values):
      print("ERROR: missing required option: %s" % (required))
      usage()
  return (values["dbconfig"], values["sqlfile"], values["outputfile"],
    values["sepchar"])
  
def main():
  """
  This is how we are called
  """
  (dbconfig, sqlfile, outputfile, sepchar) = extractOptions(sys.argv)
  sqlrunner = SqlRunner(dbconfig, sqlfile, outputfile, sepchar)
  try:
    sqlrunner.runSql()
  except Exception, e:
    usage(e)

# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
  main()

As you can see, there is nothing really new here, you can get most of this from the relevant section of the Jython User Guide. I did start to use a bit of Python lambdas, which ironically, I learned on my recent foray into Scala-land.

In the spirit of not having to install anything but the base language, the zxJDBC package comes standard with the Jython version (2.2.1) that I am using. Since I already use Java, I will install it anyway, as well as the JDBC drivers for the different databases that I will talk to.

You can call the script using command line parameters as shown in the help output below:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
sujit@sirocco:~$ ./db2csv.py -h
USAGE:
./db2csv.py -d dbconfig -q queryfile -s sepchar -o outputfile
OR: ./db2csv.py -h
OPTIONS:
--dbconfig | -d  : database configuration file
  configuration file must be in properties format, with the following
  keys defined: driver, url, user and password
--queryfile | -q : name of file containing SQL to be run
--outputfile | -o: name of file where results should be written
--sep | -s       : the separator character to use in output
--help | -h      : print this information

I had described in a previous post how one can add JAR files to the Jython classpath by appending the path names to sys.path in the code, but the approach didn't work for me here, perhaps because the appropriate Driver JAR class is being loaded explicitly by the code here. So I fell back to having it be loaded from the Java CLASSPATH instead. It's not such a bad thing, though - with this approach, one can use the script as is, for any database, as long as the JDBC driver exists on the CLASSPATH when the script is invoked. And if the driver is not in the CLASSPATH, the script tells you:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
sujit@sirocco:~$ ./db2csv.py -d /path/to/config/file -q /path/to/sql/file.sql \
  -o /path/to/report/file -s "|"
Running SQL: select foo from bar where baz = 'baz'
ERROR: driver [com.mysql.jdbc.Driver] not found
STACK TRACE:
Traceback (most recent call last):
  File "./db2csv.py", line 156, in main
    sqlrunner.runSql()
  File "./db2csv.py", line 83, in runSql
    db = zxJDBC.connect(props["url"], props["user"], props["password"],
DatabaseError: driver [com.mysql.jdbc.Driver] not found
USAGE:
./db2csv.py -d dbconfig -q queryfile -s sepchar -o outputfile
OR: ./db2csv.py -h
OPTIONS:
--dbconfig | -d  : database configuration file
  configuration file must be in properties format, with the following
  keys defined: driver, url, user and password
--queryfile | -q : name of file containing SQL to be run
--outputfile | -o: name of file where results should be written
--sep | -s       : the separator character to use in output
--help | -h      : print this information

I also had to increase the JVM heap size because Jython was running out of heap space when running the queries. I did this directly in the Jython script, by adding -Xmx2048m to the java call.

In spite of the simplicity of the script, I am already finding it immensely useful. If you have reached here looking for a similar script, I hope you will find it useful as well.

Monday, February 16, 2009

Blog Beautification with Blogger GData API

Last week, I wrote about how I had to disable the salmon/orange background in my <pre> tags so that output of the Pygments syntax colorizer rendered nicely. So that left all but a couple of my blog posts displaying code on a plain white background, which was not what I intended when I set out on my blog beautification project. I don't have that many posts (about 150 over the last 3 years), but it is still too many to go back and convert manually (and still retain my sanity). So I figured that this would be a good chance to check out the Google Blogger API.

Since Pygments is a Python project, I decided to write my conversion script in Python. The script works in three phases. Phase 1 is the download phase, where all the posts are downloaded at once to the local disk. Phase 2 consists of the extraction of the <pre> blocks, colorizing using Pygments, writing out an HTML page for manual review, as well as writing out the colorized version of the post onto local disk. The third phase is to actually upload the colorized post back to Blogger. I realize that it's nicer conceptually to be able to do this in one fell swoop, but this would not have worked for me, as I elaborate below.

Here is the full Python code. I started out not knowing about the existence of the Python GData Blogger API, so my first version used httplib for network IO and libxml2 for XML processing. I chose libxml2 for its XSLT support, but the API is a bit of a mess, so if I were to do XML processing in the future, I would probably choose minidom or elementtree instead. The Python gdata module uses httplib and elementtree under the covers, and (obviously) results in much shorter application code.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
#!/usr/bin/python
# Source: bloggerclient.py
from MyPygmentLexers import *
import gdata
from gdata import service
from gdata import atom
from pygments.formatters import HtmlFormatter
from pygments import highlight
from pygments.lexers import *
from xml.sax.saxutils import unescape
import os
import os.path
import re
import shutil
import sys

# Completely dependent on my setup. Change this stuff to suit your own.
# DATA_DIR is the local working directory; note that clean() deletes it.
DATA_DIR = "/path/to/temporary/data/directory"
# Credentials handed to the GData service in authenticate().
# NOTE(review): placeholders - avoid committing real credentials here.
BLOGGER_EMAIL = "your_email@your_domain.com"
BLOGGER_PASSWD = "your-blogger-password"
# Default publish-date window used by getBlogEntries().
PUBDATE_MIN = "2005-01-01T00:00:00"
PUBDATE_MAX = "2009-12-31T00:00:00"

# A map of string to lexer instances I need to colorize my blogs so far, you
# may need your own set if your language preferences are different. The
# special entry "none" maps to None, meaning "do not colorize".
LEXER_MAPPINGS = {
  "java" : JavaLexer(),
  "xml" : XmlLexer(),
# TODO: Change back when bug 392 is fixed:
# http://dev.pocoo.org/projects/pygments/ticket/392
#  "scala" : ScalaLexer(),
  "scala" : JavaLexer(),
  "python" : PythonLexer(),
  "py" : PythonLexer(),
  "jython" : PythonLexer(),
  "php" : PhpLexer(),
  "lisp" : CommonLispLexer(),
  "cs" : CSharpLexer(),
  "unix" : UnixConsoleLexer(),
  "bash" : BashLexer(),
  "sh" : BashLexer(),
  "mysql" : MySqlLexer(),
  "javascript" : JavascriptLexer(),
  "css" : CssLexer(),
  "jsp" : JspLexer(),
  "html" : HtmlLexer(),
  "properties" : IniLexer(),
  "diff" : DiffLexer(),
  "gp" : GnuplotLexer(),
  "rb" : RubyLexer(),
  "lucli" : LucliLexer(),
  "text" : TextLexer(),
  "none" : None
}
# Sorted type names, shown to the user when asking for a lexer.
LEXER_NAMES = sorted(LEXER_MAPPINGS.keys(), key=str)
# The keyword was previously misspelled "styel"; HtmlFormatter ignores
# unknown options, so the emacs style was silently never selected.
HTML_FORMATTER = HtmlFormatter(style='emacs', linenos='table')

### ============= GData ================

def authenticate():
  """
  Logs into Blogger using the ClientLogin approach with the configured
  email and password. Requests made through the returned service after
  this call are automatically authenticated.
  @return a reference to the blogger service
  """
  client = service.GDataService(BLOGGER_EMAIL, BLOGGER_PASSWD)
  settings = (
    ("source", 'your-useragent-here-0.1'),
    ("service", 'blogger'),
    ("account_type", 'GOOGLE'),
    ("server", 'www.blogger.com'),
  )
  for (attribute, setting) in settings:
    setattr(client, attribute, setting)
  client.ProgrammaticLogin()
  return client

def getBlogIds(blogger, userId='default'):
  """
  Fetches the blog list feed for the given userId and returns the blogId
  of each entry, taken as the part of the entry id after the last "-".
  @param blogger the reference to an authenticated service
  @param userId default value 'default' if not supplied
  @return a List of blogIds
  """
  blogsQuery = service.Query()
  blogsQuery.feed = '/feeds/%s/blogs' % (userId)
  blogsFeed = blogger.Get(blogsQuery.ToUri())
  return [blog.id.text.split("-")[-1] for blog in blogsFeed.entry]

def getBlogEntries(blogger, blogId, pubMin=PUBDATE_MIN, pubMax=PUBDATE_MAX):
  """
  Returns the posts of the specified blogId published between pubMin and
  pubMax (PUBDATE_MIN and PUBDATE_MAX by default). At most 1000 results
  are requested from the feed.
  @param blogger the reference to the authenticated service
  @param blogId the id of the blog
  @param pubMin the minimum publish date to retrieve
  @param pubMax the maximum publish date to retrieve
  @return a List of entry objects
  """
  postsQuery = service.Query()
  postsQuery.feed = '/feeds/%s/posts/default' % (blogId)
  postsQuery.published_min = pubMin
  postsQuery.published_max = pubMax
  postsQuery.max_results = 1000
  postsFeed = blogger.Get(postsQuery.ToUri())
  # hand back a fresh list rather than the feed's own entry collection
  return list(postsFeed.entry)

def minAfter(ts):
  """
  Returns a time stamp which represents one minute after specified timestamp.
  A timestamp looks like this: 2005-01-01T00:00:00
  The minute is advanced with proper calendar rollover (minute 59 moves on
  to the next hour, day, month or year as needed). Whatever follows the
  minute field (seconds, fraction, timezone offset) is kept unchanged.
  The resulting timestamp is also printed to the console.
  @param ts the specified timestamp.
  @return the timestamp representing a minute after ts.
  """
  import datetime
  import re
  result = None
  match = re.match(r'^(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2})(.*)$', ts)
  if (match):
    fields = [int(group) for group in match.groups()[:5]]
    try:
      bumped = datetime.datetime(*fields) + datetime.timedelta(minutes=1)
      result = "%04d-%02d-%02dT%02d:%02d%s" % (bumped.year, bumped.month,
        bumped.day, bumped.hour, bumped.minute, match.group(6))
    except (ValueError, OverflowError):
      # not a real calendar date/time; handled by the fallback below
      result = None
  if (result is None):
    # Unexpected layout: fall back to bumping the second ':'-separated
    # field textually, without rollover.
    parts = ts.split(":")
    parts[1] = str(int(parts[1]) + 1).zfill(2)
    result = ":".join(parts)
  print(result)
  return result

### ============== Pygments ==================

def askLexer(text):
  """
  Shows the text on the console and keeps prompting the user for its type
  until the answer is one of the keys of LEXER_MAPPINGS.
  @param text the text to display to user
  @return the Lexer instance from LEXER_MAPPINGS
  """
  print('==== text ====')
  print(text)
  print('==== /text ====')
  prompt = "Specify type (" + ", ".join(LEXER_NAMES) + "): "
  while True:
    answer = raw_input(prompt)
    if (answer in LEXER_MAPPINGS):
      return LEXER_MAPPINGS[answer]
    print('Sorry, invalid type, try again')

def guessLexer(text):
  """
  Uses the file name metadata or shebang info on first line if it exists
  and try to "guess" the lexer that is required for colorizing. The guess
  is the run of letters at the end of the first line that follows a '/',
  '|' or '.'. If there is no such run, or it is not a key of
  LEXER_MAPPINGS, the user is asked instead.
  @param text the text to analyze
  @return the Lexer instance from LEXER_MAPPINGS
  """
  # split() instead of slicing up to find("\n"): find() returns -1 when the
  # text has no newline, which used to chop the last character off a
  # single-line block. Trailing whitespace would defeat the '$' anchor.
  firstline = text.split("\n", 1)[0].rstrip()
  match = re.search(r'[/|.]([a-zA-Z]+)$', firstline)
  if (match):
    guess = match.group(1)
    if (guess in LEXER_MAPPINGS):
      return LEXER_MAPPINGS[guess]
  return askLexer(text)

def colorize(text, lexer):
  """
  Calls the pygments API to colorize text with appropriate defaults (for my
  use) for inclusion in a HTML page. When no lexer is given the text is
  only wrapped in a <pre class="hll"> block.
  @param text the input text
  @param lexer the Lexer to use, or None to skip colorizing
  @return the colorized text
  """
  # identity test: None is a singleton, and "==" would defer to whatever
  # comparison the lexer object defines
  if (lexer is None):
    # NOTE(review): text is inserted as-is here, without HTML escaping -
    # confirm that is intended for blocks of type "none".
    return "\n".join(["<pre class=\"hll\">", text, "</pre>"])
  else:
    return highlight(text, lexer, HTML_FORMATTER)

### ====================== Local IO =================

def createPreview(downloadFilename, previewFilename):
  """
  Reads the download file body and colorizes it as applicable. Writes it out
  to a HTML file (wrapped in a stylesheet, etc) so it can be previewed. If
  there is text in the downloadFile that can be colorized, then returns true
  else returns false.
  A block to colorize starts at a line that is exactly "<pre>" and ends at
  a line that is exactly "</pre>"; all other lines are copied unchanged.
  Both files are closed even if colorizing fails part way through.
  @param downloadFilename the name of the download file.
  @param previewFilename the name of the preview file.
  @return True if processing happened, else False.
  """
  previewFile = open(previewFilename, 'wb')
  try:
    previewFile.write("""
<html><head><style>
.linenos {background-color: #cccccc }
.hll { background-color: #ffffcc }
.c { color: #008800; font-style: italic } /* Comment */
.err { border: 1px solid #FF0000 } /* Error */
.k { color: #AA22FF; font-weight: bold } /* Keyword */
.o { color: #666666 } /* Operator */
.cm { color: #008800; font-style: italic } /* Comment.Multiline */
.cp { color: #008800 } /* Comment.Preproc */
.c1 { color: #008800; font-style: italic } /* Comment.Single */
.cs { color: #008800; font-weight: bold } /* Comment.Special */
.gd { color: #A00000 } /* Generic.Deleted */
.ge { font-style: italic } /* Generic.Emph */
.gr { color: #FF0000 } /* Generic.Error */
.gh { color: #000080; font-weight: bold } /* Generic.Heading */
.gi { color: #00A000 } /* Generic.Inserted */
.go { color: #808080 } /* Generic.Output */
.gp { color: #000080; font-weight: bold } /* Generic.Prompt */
.gs { font-weight: bold } /* Generic.Strong */
.gu { color: #800080; font-weight: bold } /* Generic.Subheading */
.gt { color: #0040D0 } /* Generic.Traceback */
.kc { color: #AA22FF; font-weight: bold } /* Keyword.Constant */
.kd { color: #AA22FF; font-weight: bold } /* Keyword.Declaration */
.kn { color: #AA22FF; font-weight: bold } /* Keyword.Namespace */
.kp { color: #AA22FF } /* Keyword.Pseudo */
.kr { color: #AA22FF; font-weight: bold } /* Keyword.Reserved */
.kt { color: #00BB00; font-weight: bold } /* Keyword.Type */
.m { color: #666666 } /* Literal.Number */
.s { color: #BB4444 } /* Literal.String */
.na { color: #BB4444 } /* Name.Attribute */
.nb { color: #AA22FF } /* Name.Builtin */
.nc { color: #0000FF } /* Name.Class */
.no { color: #880000 } /* Name.Constant */
.nd { color: #AA22FF } /* Name.Decorator */
.ni { color: #999999; font-weight: bold } /* Name.Entity */
.ne { color: #D2413A; font-weight: bold } /* Name.Exception */
.nf { color: #00A000 } /* Name.Function */
.nl { color: #A0A000 } /* Name.Label */
.nn { color: #0000FF; font-weight: bold } /* Name.Namespace */
.nt { color: #008000; font-weight: bold } /* Name.Tag */
.nv { color: #B8860B } /* Name.Variable */
.ow { color: #AA22FF; font-weight: bold } /* Operator.Word */
.w { color: #bbbbbb } /* Text.Whitespace */
.mf { color: #666666 } /* Literal.Number.Float */
.mh { color: #666666 } /* Literal.Number.Hex */
.mi { color: #666666 } /* Literal.Number.Integer */
.mo { color: #666666 } /* Literal.Number.Oct */
.sb { color: #BB4444 } /* Literal.String.Backtick */
.sc { color: #BB4444 } /* Literal.String.Char */
.sd { color: #BB4444; font-style: italic } /* Literal.String.Doc */
.s2 { color: #BB4444 } /* Literal.String.Double */
.se { color: #BB6622; font-weight: bold } /* Literal.String.Escape */
.sh { color: #BB4444 } /* Literal.String.Heredoc */
.si { color: #BB6688; font-weight: bold } /* Literal.String.Interpol */
.sx { color: #008000 } /* Literal.String.Other */
.sr { color: #BB6688 } /* Literal.String.Regex */
.s1 { color: #BB4444 } /* Literal.String.Single */
.ss { color: #B8860B } /* Literal.String.Symbol */
.bp { color: #AA22FF } /* Name.Builtin.Pseudo */
.vc { color: #B8860B } /* Name.Variable.Class */
.vg { color: #B8860B } /* Name.Variable.Global */
.vi { color: #B8860B } /* Name.Variable.Instance */
.il { color: #666666 } /* Literal.Number.Integer.Long */
</style></head><body>
    """)
    processed = False
    downloadFile = open(downloadFilename, 'rb')
    try:
      pres = []
      inPreBlock = False
      for line in downloadFile:
        # Drop the line terminator only when there is one; an unconditional
        # [:-1] would eat the last character of a final line without a
        # newline, so a closing "</pre>" there would never be recognised.
        if (line.endswith("\n")):
          line = line[:-1]
        if (line == "<pre>"):
          inPreBlock = True
          processed = True
        elif (line == "</pre>"):
          # the accumulated block is unescaped, then colorized as a whole
          pretext = unescape("\n".join(pres))
          colorized = colorize(pretext, guessLexer(pretext))
          previewFile.write(colorized + "\n")
          pres = []
          inPreBlock = False
        else:
          if (inPreBlock):
            pres.append(line)
          else:
            previewFile.write(line + "\n")
    finally:
      downloadFile.close()
    previewFile.write("""
</body></html>
    """)
  finally:
    previewFile.close()
  return processed

def createUpload(previewFilename, uploadFilename):
  """
  Extract the contents of the <body> element of the preview HTML file and
  write them to the upload file.

  Copying starts on the line after one that ends with "<body>" and stops at
  the next line that starts with "</body>"; the two marker lines themselves
  are never copied. Every copied line is written with a trailing newline.

  @param previewFilename the name of the preview file.
  @param uploadFilename the name of the upload file.
  """
  previewFile = open(previewFilename, 'rb')
  try:
    uploadFile = open(uploadFilename, 'wb')
    try:
      inBody = False
      for line in previewFile:
        # Strip the line terminator only when it is really there; a blind
        # line[:-1] would cut a character off a final line without newline.
        if (line.endswith(b"\n")):
          line = line[:-1]
        # Byte literals because both files are opened in binary mode.
        if (line.endswith(b"<body>")):
          inBody = True
        elif (line.startswith(b"</body>")):
          inBody = False
        elif (inBody):
          uploadFile.write(line + b"\n")
    finally:
      uploadFile.close()
  finally:
    # close the handles even when reading or writing fails part way
    previewFile.close()

def getCatalogDirs():
  """
  Returns all directories under DATA_DIR (DATA_DIR itself included, searched
  recursively) in which a catalog.txt file is found.
  @return a List of directory names.
  """
  catalogDirs = []
  # os.walk visits the tree top-down like the deprecated os.path.walk did,
  # but hands back plain files separately and is still available in Python 3.
  for directory, subdirs, files in os.walk(DATA_DIR):
    if ("catalog.txt" in files):
      catalogDirs.append(directory)
  return catalogDirs

### ============== called from command line ========================

def usage():
  """ Prints the command line help, then exits with status -1 """
  helpLines = [
    "Usage: %s download|upload|preview|clean" % (sys.argv[0]),
    "\tdownload -- download remote blog(s) into local directory",
    "\tupload -- upload colorized post(s) back to blog",
    "\tpreview -- build html for local preview before upload",
    "\tclean -- clean up the data directory",
  ]
  for helpLine in helpLines:
    print(helpLine)
  sys.exit(-1)

def clean():
  """
  Removes the data directory ahead of a new run. Nothing happens when the
  directory does not exist or when the user answers anything other than 'y'.
  """
  if (not os.path.exists(DATA_DIR)):
    return
  answer = raw_input("Deleting directory: %s. Proceed (y/n)? " % (DATA_DIR))
  if (answer != 'y'):
    return
  print("Deleting directory: %s" % (DATA_DIR))
  shutil.rmtree(DATA_DIR)

def download(blogger):
  """
  Downloads one or more blogs for the specified user. For every blog a
  directory ${DATA_DIR}/downloads/${blogId} is created holding:
    - catalog.txt, one line per post: id|published|editUrl|title|
    - ${postId}.txt, the content text of that post
  @param blogger the reference to the authenticated blogger service
  """
  downloadDir = os.sep.join([DATA_DIR, "downloads"])
  if (not os.path.exists(downloadDir)):
    os.makedirs(downloadDir)
  for blogId in getBlogIds(blogger):
    downloadBlogDir = os.sep.join([downloadDir, blogId])
    if (not os.path.exists(downloadBlogDir)):
      os.makedirs(downloadBlogDir)
    catalog = open(os.sep.join([downloadBlogDir, "catalog.txt"]), 'wb')
    try:
      for blogEntry in getBlogEntries(blogger, blogId):
        # the post id is whatever follows the last "-" of the atom entry id
        postId = blogEntry.id.text.split("-")[-1]
        # presumably .text is None for an empty element (TODO confirm against
        # the gdata client); fall back to "" so join/write do not raise
        title = blogEntry.title.text or ""
        published = blogEntry.published.text
        publishUrl = blogEntry.GetEditLink().href
        # the trailing "\n" element leaves a "|" before the newline; readers
        # of the catalog rely on that layout
        catalog.write("|".join([postId, published, publishUrl, title, "\n"]))
        print(">>> Retrieving [%s] to %s.txt" % (title, postId))
        pfile = open(os.sep.join([downloadBlogDir, postId + ".txt"]), 'wb')
        try:
          pfile.write(blogEntry.content.text or "")
        finally:
          pfile.close()
    finally:
      # do not leave the catalog open when a post fails half way through
      catalog.close()

def preview():
  """
  Runs through the downloaded post files listed in each catalog, colorizes
  the body, then wraps it into a HTML template for local viewing on a browser
  (${DATA_DIR}/preview/${blogId}/${id}.html). When createPreview reports that
  it processed something, the matching upload file is written as well
  (${DATA_DIR}/uploads/${blogId}/${id}.txt).

  Since this has a manual component (askLexer), the method checks to see if a
  preview file has already been created (so this can be run multiple times
  without deleting work done previously). If the preview generation is not
  successful (due to a code bug somewhere), then you need to manually delete
  the preview file.
  """
  for catalogDir in getCatalogDirs():
    # basename rather than split("/"), in line with the os.sep/os.path
    # handling used for every other path in this script
    blogId = os.path.basename(catalogDir)
    previewDir = os.sep.join([DATA_DIR, "preview", blogId])
    uploadDir = os.sep.join([DATA_DIR, "uploads", blogId])
    # the output directories only depend on the blog, so set them up once
    for outputDir in (previewDir, uploadDir):
      if (not os.path.exists(outputDir)):
        os.makedirs(outputDir)
    catalog = open(os.path.join(catalogDir, "catalog.txt"), 'rb')
    try:
      for catline in catalog:
        postId = catline.rstrip("\n").split("|")[0]
        if (not postId):
          # blank line, there is no post to process
          continue
        downloadFile = os.path.join(catalogDir, postId + ".txt")
        previewFile = os.path.join(previewDir, postId + ".html")
        uploadFile = os.path.join(uploadDir, postId + ".txt")
        if (os.path.exists(downloadFile) and os.path.exists(previewFile)):
          print(">>> Skipping file: %s, already processed" % (downloadFile))
          continue
        print(">>> Processing file: %s" % (downloadFile))
        processed = createPreview(downloadFile, previewFile)
        if (processed):
          createUpload(previewFile, uploadFile)
    finally:
      catalog.close()
    
def upload(blogger):
  """
  Runs through the catalog file and extract data from the upload text
  file. We then get the blogger entry and update the body with the text
  from the upload file, then HTTP PUT it back to blogger.

  The whole run stops at the first post for which the date range query does
  not return exactly one entry.

  NOTE: This does not work at the moment, I have a question posted on
  the gdata-python-client-library-contributors list:
  http://groups.google.com/group/gdata-python-client-library-contributors/browse_thread/thread/8a7f6f94873921f1
  I finally used the Java client to do the upload.

  @param blogger a reference to the authenticated blogger service.
  """
  for catalogDir in getCatalogDirs():
    blogId = os.path.basename(catalogDir)
    catalog = open(os.path.join(catalogDir, "catalog.txt"), 'rb')
    try:
      for catline in catalog:
        # Only the first three columns are needed. Taking them by position
        # keeps a "|" inside the title from breaking the parse.
        fields = catline.rstrip("\n").split("|")
        if (len(fields) < 3):
          print(">>> Skipping malformed catalog line: %s" % (catline.rstrip("\n")))
          continue
        (postId, pubdate, pubUrl) = fields[:3]
        # get data to upload
        uploadFilename = os.sep.join([DATA_DIR, "uploads", blogId, postId + ".txt"])
        uploadFile = open(uploadFilename, 'rb')
        try:
          # [:-1] drops the last character of the file, presumably its
          # final newline
          uploadData = uploadFile.read()[:-1]
        finally:
          uploadFile.close()
        # retrieve entry
        entries = getBlogEntries(blogger, blogId, pubdate, minAfter(pubdate))
        if (len(entries) != 1):
          print("Too few or too many entries found for date range, upload skipped")
          return
        entry = entries[0]
        entry.content = atom.Content("html", uploadData)
        print(entry)
        print(">>> Uploading file: %s.txt" % (postId))
        response = blogger.Put(entry, pubUrl)
        print(response)
    finally:
      # also reached on the early return above, so the catalog never leaks
      catalog.close()

def main():
  """
  Command line entry point: dispatches on the first argument (download,
  upload, preview or clean) and prints the usage for anything else. The data
  directory is created first if it is missing.
  """
  if (len(sys.argv) < 2):
    usage()
  if (not os.path.exists(DATA_DIR)):
    os.makedirs(DATA_DIR)
  action = sys.argv[1]
  if (action in ('download', 'upload')):
    # only these two actions talk to Blogger and need an authenticated service
    blogger = authenticate()
    if (action == 'download'):
      download(blogger)
    else:
      upload(blogger)
    # NOTE(review): conn is not defined in this function; presumably a
    # module-level connection object - confirm it still exists
    conn.close()
  elif (action == 'preview'):
    preview()
  elif (action == 'clean'):
    clean()
  else:
    usage()

# Run the command line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
  main()

The code above also uses some custom Pygments lexer classes I wrote, one for Lucli (described here) and another really simple one for Unix console commands. These classes are shown below:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#!/usr/bin/python
# Source: MyPygmentLexers.py
from pygments.lexer import RegexLexer, bygroups
from pygments.token import *

# All my custom Lexers here
class LucliLexer(RegexLexer):
  """
  Regex based lexer for Lucli console sessions. The 'root' state below tags
  the startup header, the prompt, and the Lucli commands (with or without
  arguments); every other line is passed through as plain text.
  """
  name = 'Lucli'
  aliases = ['lucli']
  filenames = ['*.lucli']

  # commands that may stand alone on a line
  bareCommands = ('analyzer', 'help', 'info', 'list', 'orient', 'quit',
                  'similarity', 'terms')
  # commands that may be followed by arguments on the same line
  argCommands = ('analyzer', 'count', 'explain', 'get', 'index', 'info',
                 'list', 'optimize', 'orient', 'quit', 'remove', 'search',
                 'similarity', 'terms', 'tokens')

  tokens = {
    'root' : [
      # startup banner
      (r'Lucene CLI.*$', Generic.Heading),
      # prompt
      (r'lucli> ', Generic.Prompt),
      # a command with nothing after it
      (r'(' + '|'.join(bareCommands) + r')$', Keyword),
      # a command, the whitespace after it, and its arguments (in bold)
      (r'(' + '|'.join(argCommands) + r')(\s+)(.*?)$',
       bygroups(Keyword, Text, Generic.Strong)),
      # anything else, one line at a time
      (r'.*\n', Text)
    ]
  }

class UnixConsoleLexer(RegexLexer):
  """
  Really simple lexer for Unix console commands: text up to and including a
  "$" is tagged as the prompt, the remainder of the line as plain text.
  """
  name = "UnixConsole"
  aliases = ['UnixConsole']
  filenames = ['*.sh']

  tokens = dict(
    root=[
      # shortest run of characters ending in "$"
      (r'.*?\$', Generic.Prompt),
      # whatever is left of the line
      (r'.*\n', Text),
    ]
  )

To use the bloggerclient.py script, first update the block of globals beginning with DATA_DIR, etc, with your own values. The script takes a few action parameters, similar to an Ant script. To download all the posts, invoke the following command. This will download the posts, one to a file, as well as write a catalog file (catalog.txt) in your ${DATA_DIR}/downloads/${blogId} directory.

1
prompt$ ./bloggerclient.py download

To process the downloaded posts, invoke bloggerclient.py with preview. This will attempt to colorize (freestanding) <pre> blocks in the input text into colorized output with Pygments, using the LEXER_MAPPINGS to fire the appropriate Lexer for a given block of code. It attempts to figure out the programming language from the first line of the code (where I usually have a Source: comment), otherwise, it will display the block and ask you to choose the appropriate Lexer. The colorized code is written to an HTML file with the appropriate stylesheet inlined so you can look at it before deciding to upload it. It also writes out the colorized code so it's ready for upload.

1
prompt$ ./bloggerclient.py preview

During the preview process, I discovered a bug in the Scala Lexer, which causes it to hang indefinitely, presumably within a regex lookup. I opened a bug for this. However, a quick workaround for this is to use the Java Lexer instead - it does most of what the Scala Lexer needs to do.

Finally, to upload the posts, the idea was to invoke bloggerclient.py with the upload option. However, I could not get that to work. I suspect that it's either a bug in the GData module, since other people have noticed it also, or it could be something to do with my version of httplib, since I could not get my original version using httplib to HTTP PUT to Blogger either. I have posted my problem on the gdata-python-client-library-contributors list, we'll see what comes of that.

Since I just wanted to be done with this stuff, and because I already had the colorized versions of the posts on local disk at this stage, I decided to use the Java GData API to upload, which happily succeeded. Here is the code, written out in the form of JUnit tests so it can be run easily from the command line using mvn test.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
// Source: src/main/java/com/mycompany/blogger/client/GDataBloggerUploadTest.java
package com.mycompany.blogger.client;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.junit.Test;

import com.google.gdata.client.GoogleService;
import com.google.gdata.client.Query;
import com.google.gdata.data.Content;
import com.google.gdata.data.DateTime;
import com.google.gdata.data.Entry;
import com.google.gdata.data.Feed;
import com.google.gdata.data.HtmlTextConstruct;
import com.google.gdata.data.TextContent;

/**
 * Simple test case to upload locally updated blogger pages back to Blogger.
 */
public class GDataBloggerUploadTest {

  private static final String BLOGGER_EMAIL = "your_email@company.com";
  private static final String BLOGGER_PASSWD = "your_blogger_password";
  private static final String DOWNLOAD_DIR = "/path/to/download/dir";
  private static final String UPLOAD_DIR = "/path/to/upload/dir";
  private static final String FEED_URL = "http://blogger.feed.url/";
  private static final String BLOG_ID = "your_blog_id";
  
  // NOTE(review): the pattern has no zone or fraction part, so parse() reads
  // the leading "yyyy-MM-dd'T'HH:mm:ss" of its input in the JVM default time
  // zone and ignores whatever follows - confirm this matches the timestamps
  // stored in catalog.txt.
  private static final SimpleDateFormat TS_FORMATTER = new SimpleDateFormat(
      "yyyy-MM-dd'T'HH:mm:ss");
  
  /**
   * Uploads every post listed in the catalog for which an upload file exists,
   * looking each post up through a query on the one hour window that starts
   * at its published date. Posts for which the query does not return exactly
   * one entry are skipped. After a successful update the upload file is
   * renamed to ".uploaded" so it is not picked up again.
   * Disabled by default: restore the annotation below to run it.
   */
//  @Test
  public void testUploadByPubdate() throws Exception {
    GoogleService service = login();
    // read through the catalog file for metadata
    BufferedReader catalogReader = new BufferedReader(new FileReader(
      DOWNLOAD_DIR + "/catalog.txt"));
    try {
      String catalogLine;
      while ((catalogLine = catalogReader.readLine()) != null) {
        String[] cols = StringUtils.split(catalogLine, "|");
        if (cols.length < 3) {
          // blank or truncated line: id, published date and url are required
          System.out.println("Skipping malformed catalog line: " + catalogLine);
          continue;
        }
        String id = cols[0];
        String pubDate = cols[1];
        String pubUrl = cols[2];
        // StringUtils.split drops empty tokens, so an untitled post has no
        // fourth column
        String title = (cols.length > 3) ? cols[3] : "";
        // check to see if the file needs to be uploaded (if not available,
        // then it does not need to be uploaded).
        File uploadFile = new File(UPLOAD_DIR + "/" + id + ".txt");
        if (! uploadFile.exists()) {
          System.out.println("Skipping post (" + id + "): " + title + ", no changes");
          continue;
        }
        System.out.println("Uploading post (" + id + "): " + title);
        // read the data as UTF-8, the same way testUploadAll() does, rather
        // than with the platform default charset
        String uploadData = FileUtils.readFileToString(uploadFile, "UTF-8");
        if (StringUtils.trim(uploadData).length() == 0) {
          // never replace a post with nothing
          System.out.println("Zero bytes for " + id + ", skipping");
          continue;
        }
        // retrieve the post
        long pubMinAsLong = TS_FORMATTER.parse(pubDate).getTime();
        Query query = new Query(new URL(FEED_URL));
        query.setPublishedMin(new DateTime(pubMinAsLong));
        query.setPublishedMax(new DateTime(pubMinAsLong + 3600000L)); // 1 hour after
        Feed result = service.query(query, Feed.class);
        List<Entry> entries = result.getEntries();
        if (entries.size() != 1) {
          System.out.println("Invalid number of entries: " + entries.size() + ", skip: " + id);
          continue;
        }
        Entry entry = entries.get(0);
        // then stick the updated content into the post
        entry.setContent(new TextContent(new HtmlTextConstruct(uploadData)));
        // then upload
        service.update(new URL(pubUrl), entry);
        // rename them so they are not picked up next time round
        File uploadedFile = new File(UPLOAD_DIR + "/" + id + ".uploaded");
        if (! uploadFile.renameTo(uploadedFile)) {
          System.out.println("Could not rename " + uploadFile + " to " + uploadedFile
            + ", it will be picked up again next time");
        }
      }
    } finally {
      // release the catalog even when a query or an update fails
      catalogReader.close();
    }
  }
  
  /**
   * Collects the ids of all catalog entries that have an upload file, then
   * fetches all posts in one query and updates those whose id was collected.
   * Posts with a missing or blank upload file are skipped.
   * Disabled by default: restore the annotation below to run it.
   */
//  @Test
  public void testUploadAll() throws Exception {
    GoogleService service = login();
    // read through the catalog file for metadata, and build a set of 
    // entries to upload
    Set<String> ids = new HashSet<String>();
    BufferedReader catalogReader = new BufferedReader(new FileReader(
      DOWNLOAD_DIR + "/catalog.txt"));
    try {
      String catalogLine;
      while ((catalogLine = catalogReader.readLine()) != null) {
        String[] cols = StringUtils.split(catalogLine, "|");
        if (cols.length == 0) {
          // blank line
          continue;
        }
        String id = cols[0];
        // check to see if the file needs to be uploaded (if not available,
        // then it does not need to be uploaded).
        File uploadFile = new File(UPLOAD_DIR + "/" + id + ".txt");
        if (! uploadFile.exists()) {
          continue;
        }
        ids.add("tag:blogger.com,1999:blog-" + BLOG_ID + ".post-" + id);
      }
    } finally {
      catalogReader.close();
    }
    System.out.println("#-entries to upload: " + ids.size());
    // now get all the posts
    for (Entry entry : fetchAllEntries(service)) {
      String id = entry.getId();
      if (! ids.contains(id)) {
        continue;
      }
      String title = entry.getTitle().getPlainText();
      // get contents to update
      String fn = postIdOf(id);
      System.out.println(">>> Uploading entry (" + id + "): [" + title + "] from file: " + 
        fn + ".txt");
      File uploadFile = new File(UPLOAD_DIR, fn + ".txt");
      if (! uploadFile.exists()) {
        System.out.println("Upload file does not exist: " + uploadFile.toString());
        continue;
      }
      String contents = FileUtils.readFileToString(uploadFile, "UTF-8");
      if (StringUtils.trim(contents).length() == 0) {
        System.out.println("Zero bytes for " + fn + ", skipping");
        continue;
      }
      // then stick the updated content into the post
      entry.setContent(new TextContent(
        new HtmlTextConstruct(contents)));
      String publishUrl = entry.getEditLink().getHref();
      // then upload
      service.update(new URL(publishUrl), entry);
    }
  }
  
  /**
   * Prints the post id and title of every post whose plain text content is
   * blank, so that it can be sent for reprocessing.
   */
  @Test
  public void testFindEmptyBlogs() throws Exception {
    GoogleService service = login();
    for (Entry entry : fetchAllEntries(service)) {
      String id = entry.getId();
      String title = entry.getTitle().getPlainText();
      String content = ((TextContent) entry.getContent()).getContent().getPlainText();
      if (StringUtils.trim(content).length() == 0) {
        System.out.println(postIdOf(id) + " (" + title + ")");
      }
    }
  }

  /** Creates the Blogger service and sets the configured user credentials on it. */
  private static GoogleService login() throws Exception {
    GoogleService service = new GoogleService("blogger", "salmonrun-bloggerclient-j-0.1");
    service.setUserCredentials(BLOGGER_EMAIL, BLOGGER_PASSWD);
    return service;
  }

  /** Queries the feed for the posts published from 2005-01-01 to 2009-12-31 (1000 at most). */
  private static List<Entry> fetchAllEntries(GoogleService service) throws Exception {
    Query query = new Query(new URL(FEED_URL));
    query.setPublishedMin(new DateTime(TS_FORMATTER.parse("2005-01-01T00:00:00")));
    query.setPublishedMax(new DateTime(TS_FORMATTER.parse("2009-12-31T00:00:00")));
    query.setMaxResults(1000); // I just have about 150, so this will cover everything
    Feed result = service.query(query, Feed.class);
    return result.getEntries();
  }

  /** Returns the part of an entry id that follows its last '-', i.e. the post id. */
  private static String postIdOf(String entryId) {
    return entryId.substring(entryId.lastIndexOf('-') + 1);
  }
}

The testUploadByPubdate() method tries the same approach as the Python upload() function, downloading each post by publishedDate and trying to update. However, I found that some posts could not be retrieved using this strategy. I then tried the second approach shown in the testUploadAll(), which first downloads all the posts, then runs through them, applying updates to the ones that are not updated already. This resulted in several blogs just disappearing. Apparently, the upload did not go through completely, so I had to repeat them. The third test method testFindEmptyBlogs() was to figure out which ones to send for reprocessing.

Anyway, the Blog Beautification Project is over, at least for now. Hopefully the next time round it won't be so invasive. I hope you found the results visually appealing and this post itself interesting, at least as a case study of using the Blogger API.

In retrospect, the time I took to write the Python version using httplib and libxml2, convert to using the gdata module, then finally writing a Java version of the upload was probably about the same or more than it would have taken me to do the colorization manually, but it was much more fun. I haven't written much Python code lately, so it was a nice change to be able to use it again.

Sunday, February 08, 2009

Syntax Coloring and Pygments Lexer for Lucli

I have recently come to the realization that I have been doing you, my readers, a disservice. Over the last year, I have posted blogs which mostly consist of gobs and gobs of code served without any syntax coloration. My first experience with syntax coloration was with the vim editor, and happened accidentally, with an operating system upgrade. I remember how I could suddenly see the code so much more clearly, and how my productivity shot up over the next few weeks. So syntax coloration is a big deal, and I should have addressed it sooner.

Looking around at other people's blogs, I noticed that most of them used Alex Gorbatchev's SyntaxHighlighter. SyntaxHighlighter is written in Javascript and dynamically colorizes your code using CSS. I also liked how easy it was to create a colorizer (called a "brush") for a new language. I tried to set it up for my blog with the approach described here, but could not make it work to my satisfaction - specifically, the line numbers were appearing but no syntax coloration - and yes, I did include the links to the various brushes.

I looked around for alternatives, and found the Pygments project. The Pygments project is very comprehensive and at least at first glance, slightly more complex than SyntaxHighlighter. Pygments does have an AJAX/Javascript mode thanks to some folks at Objectgraph. However, I chose to go the command line route, where you pre-build the code into HTML and stick it in. The advantage of this approach is that you can build custom lexers for your own content, and the disadvantage is that your HTML becomes very hard to read and maintain - if you want to change some code, your best bet is to copy it from the rendered version and rerun it through your colorizing process.

Pygments can be used to colorize code and render into a variety of output formats - I am only interested in the HTML formatter. To colorize a block of code, it uses language specific Lexers, which output keyword, operator, etc, tokens. The formatter takes these tokens and wraps them in <span> tags. There is a CSS file which dictates the behavior of the classes named in the span tags, which results in the syntax coloring. So for a person in my situation, my main concern would be the availability of Lexers I need, or an API by which I can build a custom Lexer.

To be fair, Pygments has a huge number of Lexers available, many more compared to the brushes in the SyntaxHighlighter project, so you may actually never have to build one yourself. However, I wanted to learn how to do this, and output from the Lucli console (see my previous blog) seemed to be a good candidate, so this post is about my experience building that.

I first generated the CSS to put into the header. You can do this by invoking the pygmentize script, as shown below:

1
prompt$ pygmentize -S emacs -f html >/tmp/styles.css

The resulting CSS file is put into the Template file in the <b:skin> block, along with the other CSS declarations. I also added the additional .linenos class in here to make the line number margin gray. Here is what my CSS block looks like.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
/* Gray background for the line number margin. */
.linenos {background-color: #cccccc }
/* Syntax coloring rules; the trailing comment on each rule names the
   Pygments token type that the class stands for. */
.hll { background-color: #ffffcc }
.c { color: #008800; font-style: italic } /* Comment */
.err { border: 1px solid #FF0000 } /* Error */
.k { color: #AA22FF; font-weight: bold } /* Keyword */
.o { color: #666666 } /* Operator */
.cm { color: #008800; font-style: italic } /* Comment.Multiline */
.cp { color: #008800 } /* Comment.Preproc */
.c1 { color: #008800; font-style: italic } /* Comment.Single */
.cs { color: #008800; font-weight: bold } /* Comment.Special */
.gd { color: #A00000 } /* Generic.Deleted */
.ge { font-style: italic } /* Generic.Emph */
.gr { color: #FF0000 } /* Generic.Error */
.gh { color: #000080; font-weight: bold } /* Generic.Heading */
.gi { color: #00A000 } /* Generic.Inserted */
.go { color: #808080 } /* Generic.Output */
.gp { color: #000080; font-weight: bold } /* Generic.Prompt */
.gs { font-weight: bold } /* Generic.Strong */
.gu { color: #800080; font-weight: bold } /* Generic.Subheading */
.gt { color: #0040D0 } /* Generic.Traceback */
.kc { color: #AA22FF; font-weight: bold } /* Keyword.Constant */
.kd { color: #AA22FF; font-weight: bold } /* Keyword.Declaration */
.kn { color: #AA22FF; font-weight: bold } /* Keyword.Namespace */
.kp { color: #AA22FF } /* Keyword.Pseudo */
.kr { color: #AA22FF; font-weight: bold } /* Keyword.Reserved */
.kt { color: #00BB00; font-weight: bold } /* Keyword.Type */
.m { color: #666666 } /* Literal.Number */
.s { color: #BB4444 } /* Literal.String */
.na { color: #BB4444 } /* Name.Attribute */
.nb { color: #AA22FF } /* Name.Builtin */
.nc { color: #0000FF } /* Name.Class */
.no { color: #880000 } /* Name.Constant */
.nd { color: #AA22FF } /* Name.Decorator */
.ni { color: #999999; font-weight: bold } /* Name.Entity */
.ne { color: #D2413A; font-weight: bold } /* Name.Exception */
.nf { color: #00A000 } /* Name.Function */
.nl { color: #A0A000 } /* Name.Label */
.nn { color: #0000FF; font-weight: bold } /* Name.Namespace */
.nt { color: #008000; font-weight: bold } /* Name.Tag */
.nv { color: #B8860B } /* Name.Variable */
.ow { color: #AA22FF; font-weight: bold } /* Operator.Word */
.w { color: #bbbbbb } /* Text.Whitespace */
.mf { color: #666666 } /* Literal.Number.Float */
.mh { color: #666666 } /* Literal.Number.Hex */
.mi { color: #666666 } /* Literal.Number.Integer */
.mo { color: #666666 } /* Literal.Number.Oct */
.sb { color: #BB4444 } /* Literal.String.Backtick */
.sc { color: #BB4444 } /* Literal.String.Char */
.sd { color: #BB4444; font-style: italic } /* Literal.String.Doc */
.s2 { color: #BB4444 } /* Literal.String.Double */
.se { color: #BB6622; font-weight: bold } /* Literal.String.Escape */
.sh { color: #BB4444 } /* Literal.String.Heredoc */
.si { color: #BB6688; font-weight: bold } /* Literal.String.Interpol */
.sx { color: #008000 } /* Literal.String.Other */
.sr { color: #BB6688 } /* Literal.String.Regex */
.s1 { color: #BB4444 } /* Literal.String.Single */
.ss { color: #B8860B } /* Literal.String.Symbol */
.bp { color: #AA22FF } /* Name.Builtin.Pseudo */
.vc { color: #B8860B } /* Name.Variable.Class */
.vg { color: #B8860B } /* Name.Variable.Global */
.vi { color: #B8860B } /* Name.Variable.Instance */
.il { color: #666666 } /* Literal.Number.Integer.Long */

For the Lucli command line, I wanted the lucli> prompt and the available commands to be highlighted in a different color, and somehow be able to distinguish the Lucli output from the user input, perhaps by bolding the user input. The LucliLexer class is quite simple, it extends the RegexLexer class and sets up the necessary regular expressions that will be matched to appropriate tokens. Here is the code.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#!/usr/bin/python
from pygments.lexer import RegexLexer, bygroups
from pygments.token import *

# All my custom Lexers here
class LucliLexer(RegexLexer):
  """
  Regex based lexer for Lucli console sessions. The 'root' state below tags
  the startup header, the prompt, and the Lucli commands (with or without
  arguments); every other line is passed through as plain text.
  """
  name = 'Lucli'
  aliases = ['lucli']
  filenames = ['*.lucli']

  # commands that may stand alone on a line
  bareCommands = ('analyzer', 'help', 'info', 'list', 'orient', 'quit',
                  'similarity', 'terms')
  # commands that may be followed by arguments on the same line
  argCommands = ('analyzer', 'count', 'explain', 'get', 'index', 'info',
                 'list', 'optimize', 'orient', 'quit', 'remove', 'search',
                 'similarity', 'terms', 'tokens')

  tokens = {
    'root' : [
      # startup banner
      (r'Lucene CLI.*$', Generic.Heading),
      # prompt
      (r'lucli> ', Generic.Prompt),
      # a command with nothing after it
      (r'(' + '|'.join(bareCommands) + r')$', Keyword),
      # a command, the whitespace after it, and its arguments (in bold)
      (r'(' + '|'.join(argCommands) + r')(\s+)(.*?)$',
       bygroups(Keyword, Text, Generic.Strong)),
      # anything else, one line at a time
      (r'.*\n', Text)
    ]
  }

For my own convenience, I create another Python script to replace the pygmentize script. I plan to call this script on source files I want to colorize (with my defaults, such as line numbering and style set to emacs mode). Unlike pygmentize, it will allow me to work with my own custom Lexers. Right now, it takes in a block of text and uses the first line of the text (where I usually put in the full path name of the file) to figure out the Lexer to call. If it cannot figure it out, then it asks the user. In the future, I plan on adding the file name as another way to figure out the correct lexer to use, but that's not done yet.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#!/usr/bin/python
from pygments import highlight
from pygments.lexers import *
from pygments.formatters import HtmlFormatter
from mypygmentlexers import *
import re

# Maps a type name (a file extension or interpreter name taken from the first
# line of a code block, or a name typed in by the user) to the lexer instance
# used to colorize that block. The keys are also listed in the askLexer prompt.
LEXER_MAPPINGS = {
  "java" : JavaLexer(),
  "xml" : XmlLexer(),
  "scala" : ScalaLexer(),
  "python" : PythonLexer(),
  "py" : PythonLexer(),
  "bash" : BashLexer(),
  "sh" : BashLexer(),
  "javascript" : JavascriptLexer(),
  "css" : CssLexer(),
  "jsp" : JspLexer(),
  "lucli" : LucliLexer(),
  "text" : TextLexer()
}

def askLexer(text):
  """
  Shows the text on the console and keeps prompting until the user enters
  one of the keys of LEXER_MAPPINGS; returns the lexer mapped to that key.
  """
  print('==== text ====')
  print(text)
  print('==== /text ====')
  prompt = "Specify type (" + ",".join(LEXER_MAPPINGS.keys()) + "): "
  while True:
    ctype = raw_input(prompt)
    if (ctype in LEXER_MAPPINGS):
      return LEXER_MAPPINGS[ctype]
    print('Sorry, invalid type, try again')

def guessLexer(text):
  """
  Uses the file name metadata or shebang info on first line if it exists
  and try to "guess" the lexer that is required for colorizing.

  The guess is the run of letters at the very end of the first line, when it
  directly follows a "/", "|" or "." (for example "java" in ".../Foo.java" or
  "python" in "#!/usr/bin/python"). It is looked up in LEXER_MAPPINGS in
  lower case; when there is no guess, or no lexer is mapped to it, the user
  is asked through askLexer.
  """
  # split() keeps the complete text when it holds no newline at all, whereas
  # the slice text[0:text.find("\n")] would then drop its last character.
  # Trailing whitespace (a "\r" included) is removed so that the "$" anchor
  # below still sees the extension.
  firstline = text.split("\n", 1)[0].rstrip()
  match = re.search(r'[/|.]([a-zA-Z]+)$', firstline)
  if (match):
    lexer = LEXER_MAPPINGS.get(match.group(1).lower())
    if (lexer is not None):
      return lexer
  return askLexer(text)

def colorize(text, lexer):
  """
  Runs the text through the pygments highlight call with the given lexer and
  an HtmlFormatter that has line numbers switched on (my defaults), and
  returns the result for inclusion in a HTML page.
  """
  return highlight(text, lexer, HtmlFormatter(linenos=True))

def fileToString(filename):
  """
  Convenience method to read a file into a String. The file is opened in
  binary mode and its complete contents are returned unchanged.
  """
  infile = open(filename, 'rb')
  try:
    return infile.read()
  finally:
    # close the handle even when the read fails
    infile.close()

def writeOutputFile(filename, coloredCode, cssFilename="/tmp/styles.css"):
  """
  Convenience method to write the colored code into the named output file
  with the appropriate surrounding html markup and style sheet. The contents
  of the style sheet file are inlined into a <style> element in the head.
  @param filename the name of the HTML file to write.
  @param coloredCode the colorized code to place inside the body.
  @param cssFilename the style sheet file to inline, /tmp/styles.css unless
         another one is given.
  """
  outfile = open(filename, 'wb')
  try:
    outfile.write("<html>\n<head>\n<style>\n")
    outfile.write(fileToString(cssFilename))
    # the body has to be opened here; writing a closing </body> tag at this
    # point left the page with two closing tags and no opening one
    outfile.write("</style>\n</head>\n<body>\n")
    outfile.write(coloredCode)
    outfile.write("</body>\n</html>\n")
  finally:
    outfile.close()

def testSingleInput(infile, outfile):
  """
  Test helper that converts the named input file into a colorized version
  written to the named output file. Can be applied to any kind of file.
  """
  print("Processing " + infile + " -> " + outfile)
  sourceText = fileToString(infile)
  lexer = guessLexer(sourceText)
  colorized = colorize(sourceText, lexer)
  writeOutputFile(outfile, colorized)

def main():
  """
  Script entry point: colorizes the hardcoded sample input file into the
  hardcoded output file.
  """
  source_path = "/tmp/test.lucli"
  target_path = "/tmp/test1.html"
  testSingleInput(source_path, target_path)

# Run main() only when executed as a script, not when imported as a module.
if __name__ == "__main__":
  main()

Running it from the command line (without arguments) produces the following output. The input file (hardcoded in the test method) is a typical Lucli session. The output file is the rendered output. For testing convenience, I build full HTML files (with stylesheet) that I can look at in my browser. The snippet below is the colorized body, since the CSS is already in my template.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
Lucene CLI. Please specify index. Type 'help' for instructions.
lucli> help
 analyzer: Set/Unset custom analyzer, default is StandardAnalyzer. Ex: analyzer [analyzer_class]
 count: Return number of results from search. Ex: count query
 explain: Generates explanation for the query. Ex: explain query
 get: Return the record at the specified position. Ex: rec 0
 help: Display command help
 index: Choose new lucene index. Ex: index my_index...
 info: Display info about current Lucene index.
 list: List all or named fields in index. Ex: list f1 f2...
 optimize: Optimize current index
 orient: Set result display orientation, default horizontal. Ex: orient [vertical|horizontal]
 quit: Quit/Exit Lucli
 remove: Remove record at specified position. Ex: remove 0
 search: Search current index. Ex: search query
 similarity: Set/Unset custom similarity, default is DefaultSimilarity
 terms: Show first 100 terms in this index. Can filter by field name if supplied. Ex: terms [field]
 tokens: Returns top 10 tokens for each document (Verbose)
lucli> index src/test/resources/movieindex1   
Index has 99998 documents 
All Fields: [released, body, title]
Indexed Fields: [released, body, title]
lucli> search +title:"happy days" +body:"fonz"
13 total matching documents
-------------------- Result-#:      0, DocId:  18851 --------------------
score                  :  6.010070
released         (I-S-): 1974
title            (ITS-): Happy Days  - Because It's There  #11.1
-------------------- Result-#:      1, DocId:  18855 --------------------
score                  :  6.010070
released         (I-S-): 1974
title            (ITS-): Happy Days  - Cruisin'  #2.16
-------------------- Result-#:      2, DocId:  18858 --------------------
score                  :  5.372923
released         (I-S-): 1974
title            (ITS-): Happy Days  - Fonzie Moves In  #3.1
-------------------- Result-#:      3, DocId:  18863 --------------------
score                  :  5.372923
released         (I-S-): 1974
title            (ITS-): Happy Days  - Hardware Jungle  #1.5
-------------------- Result-#:      4, DocId:  18883 --------------------
score                  :  5.372923
released         (I-S-): 1974
title            (ITS-): Happy Days  - Tall Story  #8.17
lucli> quit

Anyway, that's pretty much it for this week. I think that many Python programmers may already be using Pygments, but it is usable for non-Python programmers as well. Pygments has support for an enormous number of languages, so chances are that you will not have to write a line of Python code to use it, and even if you do, the API looks more complex than it really is, so I expect that you will be pleasantly surprised at how easy it is to build your own custom Lexer.

You may have noticed that the salmon background in my <pre>..</pre> blocks has disappeared from my previous blogs. I had to do this in order to make the colorizing work. The HTML formatter wraps the code in a pre block, which is in turn wrapped in a table/td block, so if I override the behavior of my pre tag, that has precedence over the behavior specified by Pygments, so I end up getting salmon colored margins and code background, which looks pretty horrible with the colorized code in it. I do have plans to replace all my pre blocks with colorized versions throughout the blog, but that is a slightly larger job than I am willing to tackle right now.

Tuesday, February 03, 2009

Lucli with Lucene 2.4/Java 6

We recently upgraded at work from Lucene 2.0 to Lucene 2.4 to pick up some performance related bugfixes. I've written before about enhancing Lucli, a command line tool for querying Lucene indexes, written by Dror Matalon. Specifically, I added very primitive scripting support, so people could write little Lucli "macros" and run them in batch.

At the time, I assumed that the Lucli code was being maintained and I wanted to contribute my changes back, so I was trying to be as unobtrusive and as well-behaved as possible. Since then, it appears that Lucli is neither actively maintained nor used - I think people with large enterprise-level indexes probably prefer a web based interface - so I figured it would be safe (and educational, given the API changes between Lucene 2.0 and 2.4) to create my own version with Lucene 2.4 and Java 1.6. So here it is.

The code shown below has the following new features, compared to the original Lucli version 2.4 code from which it was adapted.

  1. Rewritten using Java 1.6/Lucene 2.4
  2. Scripting interface described here.
  3. New commands: list, get, remove, analyzer, similarity, orient
  4. Expects queries in Lucene's Query Syntax.
  5. Results display (ITSV) metadata.
  6. Support for multisearcher mode for search based operations
  7. Removed pagination code from Lucli, uses JLine's pagination support instead
  8. New LucliCommands Enum, makes addition of new commands slightly easier than original version.

Rewritten using Java 1.6/Lucene 2.4

This probably benefits me more than anybody else. However, it is more convenient to use the newer Java idioms and results in less (and somewhat more readable, IMO) code. It also resulted in a slightly simpler (one step less) way of adding a new command to Lucli (described in detail later).

My main aim in doing this was to learn the new Lucene 2.4 API, so I guess it's a no-brainer. Some changes I am aware of after this exercise are:

  1. The Hits class is deprecated.
  2. Score normalization is no longer done (see this bug for details).
  3. Support for reusable Tokens returned from a TokenStream.
  4. Many improvements on the indexing side.

list

This is functionality I had put in earlier, but the difference is a more natural way (space separated instead of colon separated) of specifying the fields that should show up in the output. The command allows the user to dump a list of Documents from the index in Document Id order. This is probably not all that useful in interactive mode, but it can be used in script mode to answer questions about whether a certain document (or documents) are available in the index or not. Here is a little demo.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
sirocco:~$ lucli.sh                                       
Lucene CLI. Please specify index. Type 'help' for instructions.
lucli> index src/test/resources/movieindex1
Index has 99999 documents
All Fields: [released, body, title]
Indexed Fields: [released, body, title]
lucli> list title
-------------------- Result-#:      0, DocId:      0 --------------------
title            (ITS-): $1,000,000 Chance of a Lifetime
-------------------- Result-#:      1, DocId:      1 --------------------
title            (ITS-): $10,000 Pyramid, The
-------------------- Result-#:      2, DocId:      2 --------------------
title            (ITS-): $100,000 Name That Tune
-------------------- Result-#:      3, DocId:      3 --------------------
title            (ITS-): $25 Million Dollar Hoax
...

get

The get command allows you to print a document with the given document id. A possible use for it is to drill down into a particular document that was found by dumping out the list and pattern matching with grep or similar tool. For example, here is Document 3.

1
2
3
4
5
lucli> get 3
-------------------- Result-#:      0, DocId:      3 --------------------
released         (I-S-): 2004
title            (ITS-): $25 Million Dollar Hoax
lucli>

remove

The remove command allows you to remove a document from the index. Obviously, this is a dangerous command, but it can be useful, for example, when required to quickly remove a document from search results without having to regenerate the index. To delete our document #3, we can do this:

1
2
3
4
5
lucli> remove 3
Document 3 deleted
lucli> get 3
ERROR: attempt to access a deleted document
lucli> 

analyzer

Lucli 2.4 had the Analyzer hard coded to StandardAnalyzer. Since we (probably like quite a few other companies) build our indexes using a custom Analyzer, I needed a way to set the custom Analyzer into Lucli. The 'analyzer' command does this. If no arguments are supplied, the current Analyzer class name is returned. To specify an Analyzer, the class name of the Analyzer must be specified in the 'analyzer' command, the Analyzer class must exist on the classpath, and the Analyzer must have a public default (zero-argument) constructor. For example:

1
2
3
4
5
lucli> analyzer
Analyzer set: org.apache.lucene.analysis.standard.StandardAnalyzer
lucli> analyzer com.mycompany.customextensions.analysis.MyCustomAnalyzer
Analyzer set: com.mycompany.customextensions.analysis.MyCustomAnalyzer
lucli> 

similarity

Like the Analyzer, Lucli 2.4 uses DefaultSimilarity as its default Similarity implementation. We use a custom similarity implementation for one of our index tiers, so having a way to set this was important for us. The 'similarity' command is similar to the 'analyzer' command. If supplied without arguments, the current Similarity setting is returned. To set a custom Similarity implementation, the full class name of the Similarity implementation must be specified, the Similarity class must exist on the classpath, and the Similarity must have a public default constructor. For example:

1
2
3
4
5
lucli> similarity                                                   
Similarity set: org.apache.lucene.search.DefaultSimilarity
lucli> similarity org.apache.lucene.search.DefaultSimilarity
Similarity set: org.apache.lucene.search.DefaultSimilarity
lucli> 

orient

Lucli was originally intended to serve as an interactive tool, so its display format reflects that. However, when used in batch mode, you typically want to use other Unix (or other) tools on the output, and the one-line-per-document format works better in such cases. So the 'orient' command allows the user to set the orientation. The default is set to vertical. A sample session is shown below, showing the usage of the 'orient' command and the result of setting the orientations.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
lucli> orient              
Result orientation is vertical
lucli> search title:fonz
3 total matching documents

-------------------- Result-#:      0, DocId:  15827 --------------------
score                  :  5.563311
released         (I-S-): 1980
title            (ITS-): Fonz and the Happy Days Gang, The
-------------------- Result-#:      1, DocId:  18845 --------------------
score                  :  4.867897
released         (I-S-): 1974
title            (ITS-): Happy Days  - A.K.A. the Fonz  #4.10
-------------------- Result-#:      2, DocId:  14808 --------------------
score                  :  4.172483
released         (I-S-): 1999
title            (ITS-): Family Guy  - The Father, the Son and the Holy Fonz  #4.18
lucli> orient horizontal
lucli> orient
Result orientation is horizontal
lucli> search title:fonz
3 total matching documents

##resultId;docId;score;released;title
0;15827 5.563311;1980;Fonz and the Happy Days Gang, The
1;18845 4.867897;1974;Happy Days  - A.K.A. the Fonz  #4.10
2;14808 4.172483;1999;Family Guy  - The Father, the Son and the Holy Fonz  #4.18
lucli> 

Lucene Query Syntax expected

Lucli 2.4 goes to great lengths to make the query interface behave like a web search box. It takes plain strings, and attempts to match it with every indexable field using a MultiFieldQueryParser. The current one is simpler - it still supports the plain string approach, but matches it to the default field, as shown below:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
lucli> explain "fonz"
Rewritten query: body:fonz
16 total matching documents

-------------------- Result-#:      0, DocId:  15828 --------------------
score                  :  2.138932
released         (I-S-): 2000
title            (ITS-): Food 911  - The Spaghetti Kid  #4.33
Explanation            : 2.138932 = (MATCH) fieldWeight(body:fonz in 15828), product of:
  1.4142135 = tf(termFreq(body:fonz)=2)
  9.679702 = idf(docFreq=16, numDocs=99998)
  0.15625 = fieldNorm(field=body, doc=15828)
-------------------- Result-#:      1, DocId:  18848 --------------------
...

However, the people who are going to use my version of the tool are likely to be more comfortable writing queries (or pasting them from the logs) in the Lucene query syntax. So I have made the query parsing less user-friendly and more programmer-friendly. The result is the ability to specify exactly what you want, for example:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
lucli> search title:fonz
3 total matching documents

-------------------- Result-#:      0, DocId:  15827 --------------------
score                  :  5.563311
released         (I-S-): 1980
title            (ITS-): Fonz and the Happy Days Gang, The
-------------------- Result-#:      1, DocId:  18845 --------------------
score                  :  4.867897
released         (I-S-): 1974
title            (ITS-): Happy Days  - A.K.A. the Fonz  #4.10
-------------------- Result-#:      2, DocId:  14808 --------------------
score                  :  4.172483
released         (I-S-): 1999
title            (ITS-): Family Guy  - The Father, the Son and the Holy Fonz  #4.18
lucli> search +title:fonz +released:1999
1 total matching documents

-------------------- Result-#:      0, DocId:  14808 --------------------
score                  :  5.574712
released         (I-S-): 1999
title            (ITS-): Family Guy  - The Father, the Son and the Holy Fonz  #4.18
lucli> 

ITSV Metadata

Knowing the ITSV (Indexed, Tokenized, Stored, Term Vector) metadata for a field can be very useful. For example, a field is searchable and sortable only if it is indexed, and sometimes you may forget to set it to indexed when building the index and your search will not pick up the Document, even though you know the Document is there. Well, anyway, this is probably a newbie mistake, but I find that being able to see the ITSV metadata for a field can help in certain cases. It's already available in Luke, so I figured, why not put it in here. You will see this in the vertical orientation only. For example, as you can see below, the released field is Indexed and Stored and the title is Indexed, Tokenized and Stored.

1
2
3
4
5
6
7
8
lucli> search title:"happy days"
56 total matching documents

-------------------- Result-#:      0, DocId:  18843 --------------------
score                  :  7.991356
released         (I-S-): 1970
title            (ITS-): Happy Days
...

Multisearcher Support

Lucli 2.4 did not provide multisearcher support, but sometimes it's helpful to see what Documents are being returned from a multisearcher for a given query. Certain commands, however, (such as list, get or remove) are based on a single index backing the searcher, so these commands do not work in multisearcher mode. To build a multisearcher, simply specify multiple paths in the 'index' command. The example below shows how to do this:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
lucli> index src/test/resources/movieindex1
Index has 99999 documents 
All Fields: [released, body, title]
Indexed Fields: [released, body, title]
lucli> index src/test/resources/movieindex1 src/test/resources/movieindex2
Index has 147520 documents 
All Fields: [released, body, title]
Indexed Fields: [released, body, title]
Multisearcher mode set
lucli> list title
ERROR: Impossible operation, 'index' to reset to single searcher mode

The first call above is to open the movieindex1 index, which has 99,999 documents. The second call opens a multisearcher on the movieindex1 and movieindex2 indexes, with a combined document count of 147,520 documents. As you can see, trying to run the 'list' command results in an error.

JLine Pagination

Lucli 2.4 had the pagination logic for showing 10 records per "page" built into the LuceneMethods.printHits() method. Since JLine already has support for pagination, I decided to move this logic out to JLine. JLine computes the terminal height and considers each line to be newline terminated, so if your index has long fields spanning multiple lines, you will still have to use your terminal scrollbar. Also, the --More-- prompt does not allow you to back up a screen, unlike the behavior of the Unix less. But hopefully, this will show up with a future release of JLine. Here is what it looks like:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
lucli> search +title:"happy days" +body:"fonz"
13 total matching documents
-------------------- Result-#:      0, DocId:  18851 --------------------
score                  :  6.010070
released         (I-S-): 1974
title            (ITS-): Happy Days  - Because It's There  #11.1
-------------------- Result-#:      1, DocId:  18855 --------------------
score                  :  6.010070
released         (I-S-): 1974
title            (ITS-): Happy Days  - Cruisin'  #2.16
-------------------- Result-#:      2, DocId:  18858 --------------------
score                  :  5.372923
released         (I-S-): 1974
title            (ITS-): Happy Days  - Fonzie Moves In  #3.1
-------------------- Result-#:      3, DocId:  18863 --------------------
score                  :  5.372923
released         (I-S-): 1974
title            (ITS-): Happy Days  - Hardware Jungle  #1.5
-------------------- Result-#:      4, DocId:  18883 --------------------
score                  :  5.372923
released         (I-S-): 1974
title            (ITS-): Happy Days  - Tall Story  #8.17
-------------------- Result-#:      5, DocId:  18868 --------------------
score                  :  5.190344
released         (I-S-): 1974
--More--

Also, the only output that is paginated is that of the LuceneMethods.printHits() call. Unlike the original code, a Collection<String> is created by printHits() and passed to Lucli.print() which wraps the call to the JLine paginated output method. So if there is other output, such as the count information above, this is not paginated. The current design also introduces a noticeable lag, at least for the 'list' command, as it builds a Collection of all the formatted output. So, in all, while relying on JLine pagination reduces the application code, I may need to revisit the decision and build the pagination code back into the printHits method so it does a page per call.

Adding/Deleting Commands

To add a command:

  1. Add an enum entry for your command into LucliCommand.java. The arguments for the enum are the Lucli command name, the help message and the minimum number of parameters the command needs to work. For example, 'similarity' can work with 0 parameters, while 'index' needs at least 1 parameter.
  2. Add the case logic for the new enum element in the switch block of the Lucli.handleCommand() method. The preferred approach is to delegate to a method in LucliMethods.java.
  3. Add the method that does what you want in LucliMethods.java

Deleting a command is the reverse of adding a command:

  1. Remove or comment out the enum entry for the command in LucliCommand.java
  2. Comment out the case logic inside the handleCommand() method's switch block in Lucli.java

You will need to recompile for the changes to take effect.

The code for my version of Lucli is shown below. I've used the pygmentize script from the Pygments project to colorize the code, hopefully it's easier to read than the code I have been sharing in my previous blogs.

Code: Lucli.java

This is the main class that is called from the shell. If you have the original Lucene distribution, then the jar file is created inside lib, and you can use the run.sh script to run it.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
// Source: src/main/java/org/apache/lucene/contrib/lucli/Lucli.java
package org.apache.lucene.contrib.lucli;

/* ====================================================================
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 2001 The Apache Software Foundation.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
 *    "Apache Lucene" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
 *    "Apache Lucene", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;

import java.util.Collection;
import jline.ArgumentCompletor;
import jline.Completor;
import jline.ConsoleReader;
import jline.FileNameCompletor;
import jline.History;
import jline.SimpleCompletor;

import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.queryParser.ParseException;

/**
 * Main class for lucli: the Lucene Command Line Interface.
 * This class handles mostly the actual CLI part, command names, help, etc.
 */
public class Lucli {

  public final static String DEFAULT_FIELDNAME = "body";
 public final static String HISTORYFILE = ".lucli";
 public final static int MAX_TERMS = 100;
  public final static int MAX_TOKENS = 10;

 private String historyFile;
 private File script = null;
  private ConsoleReader consoleReader = null;

 /**
   * Main entry point. The first argument can be a filename with an
   * application initialization file.
  */
 public Lucli(String[] args) throws IOException {
  String line;
  historyFile = System.getProperty("user.home") +
      File.separator + HISTORYFILE;
  // parse command line arguments (in case we want to use Lucli in script mode)
  parseArgs(args);
  if (script != null) {
    consoleReader = new ConsoleReader(new FileInputStream(script),
        new PrintWriter(System.out));
  } else {
    consoleReader = new ConsoleReader();
      consoleReader.setHistory(new History(new File(historyFile)));
      consoleReader.setUsePagination(true);
  }
  // set completer with list of words
    Completor[] comp = new Completor[] {
      new SimpleCompletor(LucliCommand.getNames()),
      new FileNameCompletor()
    };
    consoleReader.addCompletor (new ArgumentCompletor(comp));
  // main input loop
  LuceneMethods luceneMethods = new LuceneMethods();
  while (true) {
   try {
    line = consoleReader.readLine("lucli> ");
    if (line != null) {
          handleCommand(line, consoleReader, luceneMethods);
    }
   } catch (java.io.EOFException e) {
    System.out.println(""); //new line
    exit();
   } catch (UnsupportedEncodingException e) {
    e.printStackTrace(System.err);
   } catch (IOException e) {
    e.printStackTrace(System.err);
   }
  }
 }

  protected static void message(String s) {
    System.out.println(s);
  }

  protected static void error(String s) {
    message("ERROR: " + s);
  }

 private void handleCommand(String line, ConsoleReader cr,
      LuceneMethods luceneMethods) {
    String[] words = tokenizeCommand(line);
    if (words.length == 0) {
      return; // white space
    }
    if (line.trim().startsWith("#")) {
      return; // # == comment
    }
    LucliCommand command = LucliCommand.fromName(words[0]);
    if (command.minParams > words.length - 1) {
      error(command.help);
      return;
    }
    try {
      switch (command) {
        case INFO:
          luceneMethods.info();
          break;
        case SEARCH:
          print(luceneMethods.search(joinTail(words), false, false));
          break;
        case OPTIMIZE:
          luceneMethods.optimize();
          break;
        case QUIT:
          luceneMethods.closeIndex();
          exit();
        case HELP:
          help();
          break;
        case COUNT:
          luceneMethods.count(joinTail(words));
          break;
        case TERMS:
          luceneMethods.terms(joinTail(words));
          break;
        case INDEX:
          luceneMethods.resetIndex(joinTail(words));
          break;
        case TOKENS:
          print(luceneMethods.search(joinTail(words), false, true));
          break;
        case EXPLAIN:
          print(luceneMethods.search(joinTail(words), true, false));
          break;
        case LIST:
          print(luceneMethods.list(joinTail(words)));
          break;
        case ANALYZER:
          luceneMethods.analyzer(joinTail(words));
          break;
        case SIMILARITY:
          luceneMethods.similarity(joinTail(words));
          break;
        case GET:
          print(luceneMethods.getRecord(joinTail(words)));
          break;
        case ORIENTATION:
          luceneMethods.orientResults(joinTail(words));
          break;
        case REMOVE:
          luceneMethods.removeRecord(joinTail(words));
          break;
        case NOCOMMAND:
          // do nothing
          break;
        case UNKNOWN:
        default:
          error("Unknown command: " + words[0] +
            ", type help for list of commands");
          break;
      }
    } catch (ParseException e) {
      error("Malformed query: " + e.getMessage());
      dumpStack(e);
    } catch (ClassNotFoundException e) {
      error("Class not found: " + e.getMessage());
      dumpStack(e);
    } catch (InstantiationException e) {
      error("Cannot instantiate class: " + e.getMessage() +
        ", ensure default constructor");
      dumpStack(e);
    } catch (IllegalAccessException e) {
      error("Unable to access class: " + e.getMessage());
      dumpStack(e);
    } catch (CorruptIndexException e) {
      error("Index is corrupt");
      dumpStack(e);
    } catch (IOException e) {
      error(e.getMessage());
      dumpStack(e);
    } catch (IllegalArgumentException e) {
      error(e.getMessage());
      dumpStack(e);
    }
  }

  /**
   * Removes the first word from the tokenized word array and returns
   * the rest of it joined into a single string.
   * @param words the array of tokens.
   * @return the string consisting of words[1..words.length-1].
   */
  private String joinTail(String[] words) {
    if (words.length > 1) {
      StringBuilder buf = new StringBuilder();
      for (int i = 1; i < words.length; i++) {
        buf.append(words[i]).append(" ");
      }
      return buf.toString().trim();
    } else {
      return "";
    }
  }

  /**
   * Tokenize the input command line by splitting by space.
   * @param line the input line to be tokenized.
   * @return an array of tokens.
   */
 private String[] tokenizeCommand(String line) {
    return line.split("[ \t]");
 }

 private void exit() {
  System.exit(0);
 }

  /**
   * Prints the help message. This is the "usage" in interactive mode.
   */
 private void help() {
    for (LucliCommand command : LucliCommand.getMap().values()) {
      System.out.println("\t" + command.name + ": " + command.help);
    }
 }

 /*
  * Only parse command line argument --file (or -f).
  */
 private void parseArgs(String[] args) {
   String errorMessage = null;
  if (args.length > 0) {
    if (args.length == 2 && 
        ("--file".equals(args[0]) || "-f".equals(args[0]))) {
      File scriptfile = new File(args[1]);
      if (scriptfile.exists() && scriptfile.canExecute()) {
        this.script = scriptfile;
        return;
      } else {
        errorMessage = "File:" + args[1] +
            " does not exist or is not executable";
      }
    }
   usage(errorMessage);
   System.exit(1);
  }
 }

  /**
   * Prints command line usage information. Includes an error message if
   * appropriate. This is useful in script mode. The error message serves
   * as an indication of where the problem is in the script.
   * @param errorMessage the error thrown by the Lucli command.
   */
 private void usage(String errorMessage) {
  message("Usage: lucli.Lucli [--file script_file]");
  if (errorMessage != null) {
    error(errorMessage);
  }
 }

  /**
   * Debugging hook for dumping an exception's stack trace. The only
   * statement in the body is currently commented out, so calling this
   * method does nothing; re-enable it during testing to see full traces.
   * @param e the Exception whose stack trace would be printed.
   */
  private void dumpStack(Exception e) {
    // Intentionally a no-op: uncomment the next line to print stack traces.
//    e.printStackTrace();
  }

  /**
   * Prints a collection of strings through the console reader's
   * printColumns method. An IOException raised while printing is reported
   * with error() rather than propagated.
   * @param ss the collection of strings to print.
   */
  private void print(Collection<String> ss) {
    try {
      consoleReader.printColumns(ss);
    } catch (IOException ioe) {
      String reason = ioe.getMessage();
      error("IO Exception: " + reason);
    }
  }

  /**
   * This is how we are called. An IOException raised while constructing
   * Lucli is reported on the error channel together with its stack trace,
   * and the JVM then exits with a non-zero status so that calling scripts
   * can detect the failure.
   * @param args command line arguments. Usage is:
   *        Lucli [--file script_file]
   */
  public static void main(String[] args) {
    try {
      new Lucli(args);
    } catch (IOException e) {
      // this is thrown by the jline framework and should probably never
      // happen, but we catch it here and report it just in case it does.
      error(e.getMessage());
      e.printStackTrace(System.err);
      // Non-zero status: this is a failure path, consistent with parseArgs.
      System.exit(1);
    }
  }
}

Code: LucliCommand.java

This is a newly introduced Enum that contains all the commands that are available from the Lucli interface, along with their descriptions and the minimum number of parameters that they need to operate. This was originally achieved in Lucli using an inner Command class that did about the same thing, but with Enums we don't need a command id anymore. Also, having an Enum reduces the number of places one needs to change when adding or removing commands from Lucli.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
// Source: src/main/java/org/apache/lucene/contrib/lucli/LucliCommand.java
package org.apache.lucene.contrib.lucli;

/* ====================================================================
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 2001 The Apache Software Foundation.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
 *    "Apache Lucene" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
 *    "Apache Lucene", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */

import java.util.Collections;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;

/**
 * Enumeration of commands available in Lucli. Each command carries the name
 * typed by the user, a help string, and the minimum number of parameters it
 * needs. Commands whose help string is null (NOCOMMAND, UNKNOWN) are
 * internal markers and are left out of the name lookup map.
 */
public enum LucliCommand {

  NOCOMMAND ("nocommand", null, 0),
  UNKNOWN ("unknown", null, 0),
  INFO ("info", "Display info about current Lucene index.", 0),
  SEARCH ("search", "Search current index. Ex: search query", 1),
  OPTIMIZE ("optimize", "Optimize current index", 0),
  QUIT ("quit", "Quit/Exit Lucli", 0),
  HELP ("help", "Display command help", 0),
  COUNT ("count", "Return number of results from search. Ex: count query", 1),
  TERMS ("terms", "Show first " + Lucli.MAX_TERMS + 
    " terms in this index. Can filter by field name if supplied. " +
    "Ex: terms [field]", 0),
  INDEX ("index", "Choose new lucene index. Ex: index my_index...", 1),
  TOKENS ("tokens", "Returns top " + Lucli.MAX_TOKENS +
    " tokens for each document (Verbose)", 1),
  EXPLAIN ("explain",
    "Generates explanation for the query. Ex: explain query", 1),
  LIST ("list", "List all or named fields in index. Ex: list f1 f2...", 0),
  ANALYZER ("analyzer",
    "Set/Unset custom analyzer, default StandardAnalyzer. " +
    "Ex: analyzer [analyzer_class]", 0),
  // The example previously read "rec 0", which is not a Lucli command.
  GET ("get", "Return the record at the specified position. Ex: get 0", 1),
  ORIENTATION ("orient",
    "Set result display orientation, default horizontal. " +
    "Ex: orient [vertical|horizontal]", 0),
  SIMILARITY ("similarity",
    "Set/Unset custom similarity, default DefaultSimilarity", 0),
  REMOVE ("remove", "Remove record at specified position. Ex: remove 0", 1);

  /** The command word as typed at the prompt. */
  public final String name;
  /** One-line help text, or null for internal marker commands. */
  public final String help;
  /** Minimum number of parameters the command requires. */
  public final int minParams;

  LucliCommand(String name, String help, int minParams) {
    this.name = name;
    this.help = help;
    this.minParams = minParams;
  }

  /**
   * Name-sorted, read-only lookup of all user-visible commands. It is built
   * in a static initializer, which runs after the enum constants have been
   * constructed, so no lazy-initialization flag is needed.
   */
  private static final SortedMap<String,LucliCommand> COMMAND_MAP;
  static {
    SortedMap<String,LucliCommand> byName =
      new TreeMap<String,LucliCommand>();
    for (LucliCommand command : values()) {
      if (command.help != null) {
        byName.put(command.name, command);
      }
    }
    COMMAND_MAP = Collections.unmodifiableSortedMap(byName);
  }

  /**
   * Looks up a command by the name typed by the user.
   * @param name the command word; may be null.
   * @return the matching command, or UNKNOWN when the name is null or does
   *         not match any user-visible command.
   */
  public static LucliCommand fromName(String name) {
    if (name == null) {
      return UNKNOWN;
    }
    LucliCommand command = COMMAND_MAP.get(name);
    return (command == null) ? UNKNOWN : command;
  }

  /**
   * Returns the user-visible commands keyed by name, in name order.
   * @return an unmodifiable map of command name to command.
   */
  public static Map<String,LucliCommand> getMap() {
    return COMMAND_MAP;
  }

  /**
   * Returns the names of all user-visible commands in sorted order.
   * @return a new array of command names.
   */
  public static String[] getNames() {
    return COMMAND_MAP.keySet().toArray(new String[0]);
  }
}

Code: LuceneMethods.java

This class is almost all Lucene code. Apart from the new commands which I have described above, I have tried to use the new features of Lucene 2.4 to reimplement the existing methods.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
// Source: src/main/java/org/apache/lucene/contrib/lucli/LuceneMethods.java
package org.apache.lucene.contrib.lucli;

/* ====================================================================
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 2001 The Apache Software Foundation.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
 *    "Apache Lucene" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
 *    "Apache Lucene", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.TreeMap;
import java.util.Map.Entry;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;

import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.IndexReader.FieldOption;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searchable;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.TopDocs;

/**
 * Various methods that interact with Lucene and provide info about the 
 * index, search, etc. Parts adapted from Lucene demo.
 */
class LuceneMethods {

  private String indexName;
  private Searcher searcher;
  private Analyzer analyzer;
  private Similarity similarity;
  private boolean orientVertical = true; // suitable for interactive use
  private boolean isMultisearcher = false;

  /**
   * Creates the helper with a StandardAnalyzer and a DefaultSimilarity and
   * prints the startup banner. No index is opened here.
   */
  public LuceneMethods() {
    similarity = new DefaultSimilarity();
    analyzer = new StandardAnalyzer();
    Lucli.message("Lucene CLI. Please specify index. Type 'help' for instructions.");
  }

  /**
   * Handler for the "index" command. The new index is opened first; only
   * if that succeeds is the current searcher closed and replaced. In
   * multisearcher mode the first of the space separated names is recorded
   * as the current index name.
   * @param indexName the index name (or space separated names) to set.
   * @throws CorruptIndexException
   * @throws IOException
   */
  public void resetIndex(String indexName) 
      throws CorruptIndexException, IOException {
    Searcher replacement = openIndex(indexName, false);
    // release the previous searcher before swapping in the new one
    closeIndex();
    this.searcher = replacement;
    this.indexName = isMultisearcher ? indexName.split(" ")[0] : indexName;
  }

  /**
   * Closes the current searcher, if there is one. Used before quitting and
   * by resetIndex to release the previous searcher. An IOException raised
   * by the close is deliberately ignored.
   */
  public void closeIndex() {
    if (searcher == null) {
      return;
    }
    try {
      searcher.close();
    } catch (IOException ignored) {
      // best effort: nothing useful can be done if the close fails
    }
  }

  /**
   * Called from "info" command. Checks that an index has been set, using
   * the same validateIndexSet() guard as the other commands, and then
   * delegates to the private version of this method, which prints some
   * information about the index.
   * @throws IOException if index does not exist or is corrupt.
   */
  public void info() throws IOException {
    // Without this guard an "info" issued before any "index" command would
    // dereference a null searcher in the private info() method.
    validateIndexSet();
    info(this.indexName, this.searcher);
  }

  /**
   * Allows setting/resetting the similarity implementation held by this
   * object. Called from the "similarity" command. When the argument is
   * null or blank the similarity is reset to DefaultSimilarity; otherwise
   * the named class is loaded and instantiated through its no-argument
   * constructor. In both cases the class name now in effect is printed.
   * @param similarity the full class name of the Similarity implementation,
   *        or null/blank to reset to the default.
   * @throws ClassNotFoundException if the named class cannot be found.
   * @throws InstantiationException if the class cannot be instantiated.
   * @throws IllegalAccessException if its constructor is not accessible.
   */
  public void similarity(String similarity) 
      throws ClassNotFoundException, InstantiationException,
      IllegalAccessException {
    // Trim once and use the trimmed name for loading too, so stray
    // whitespace around the class name does not cause a lookup failure.
    String className = (similarity == null) ? "" : similarity.trim();
    if (className.length() == 0) {
      this.similarity = new DefaultSimilarity();
    } else {
      // The cast happens before the assignment, so a class of the wrong
      // type leaves the current similarity untouched.
      this.similarity = (Similarity) Class.forName(className).newInstance();
    }
    Lucli.message("Similarity set: " + this.similarity.getClass().getName());
  }
  
  /**
   * Allows setting/resetting the Analyzer. Called from the "analyzer"
   * command. When the argument is null or blank the analyzer is reset to
   * StandardAnalyzer; otherwise the named class is loaded and instantiated
   * through its no-argument constructor. In both cases the class name now
   * in effect is printed.
   * @param analyzer the class name of the analyzer to set, or null/blank
   *        to reset to the default.
   * @throws ClassNotFoundException if the named class cannot be found.
   * @throws InstantiationException if the class cannot be instantiated.
   * @throws IllegalAccessException if its constructor is not accessible.
   */
  public void analyzer(String analyzer) 
      throws ClassNotFoundException, InstantiationException,
      IllegalAccessException {
    // Trim once and use the trimmed name for loading too, so stray
    // whitespace around the class name does not cause a lookup failure.
    String className = (analyzer == null) ? "" : analyzer.trim();
    if (className.length() == 0) {
      // reset analyzer back to default
      this.analyzer = new StandardAnalyzer();
    } else {
      // The cast happens before the assignment, so a class of the wrong
      // type leaves the current analyzer untouched.
      this.analyzer = (Analyzer) Class.forName(className).newInstance();
    }
    Lucli.message("Analyzer set: " + this.analyzer.getClass().getName());
  }

  /**
   * Handler for the "orient" command. With a null or blank argument the
   * current orientation is printed. An argument starting with "vert"
   * selects vertical output and one starting with "hori" selects horizontal
   * output; anything else prints an error message and changes nothing.
   * @param orientation the orientation to set (vertical or horizontal).
   */
  public void orientResults(String orientation) {
    boolean queryOnly =
      (orientation == null) || (orientation.trim().length() == 0);
    if (queryOnly) {
      // no argument: just report the current setting
      String current = this.orientVertical ? "vertical" : "horizontal";
      Lucli.message("Result orientation is " + current);
      return;
    }
    if (orientation.startsWith("vert")) {
      this.orientVertical = true;
      return;
    }
    if (orientation.startsWith("hori")) {
      this.orientVertical = false;
      return;
    }
    Lucli.message("Invalid orientation, valid values vertical or horizontal");
  }

  /**
   * Handler for the "count" command. Parses the query string, runs it and
   * prints how many documents matched.
   * @param queryString the query in Lucene's query syntax.
   * @throws IOException
   * @throws ParseException
   */
  public void count(String queryString) throws IOException, ParseException {
    ScoreDoc[] matches = initSearch(parse(queryString));
    Lucli.message(matches.length + " total documents");
  }

  /**
   * Parses and runs a query against the current searcher and formats every
   * hit through printHit. The number of matches is printed, preceded by the
   * query's string form when an explanation was requested.
   * @param queryString the Lucene query string.
   * @param explain if true, shows the explanation for each document returned.
   * @param showTokens if true, shows the tokens found for each document.
   * @return the formatted lines for all hits, in result order.
   * @throws IOException
   * @throws ParseException
   */
  public List<String> search(String queryString, boolean explain,
      boolean showTokens) throws IOException, ParseException {
    Query query = parse(queryString);
    ScoreDoc[] hits = initSearch(query);
    if (explain) {
      Lucli.message("Rewritten query: " + query.toString());
    }
    Lucli.message(hits.length + " total matching documents");
    List<String> results = new ArrayList<String>();
    for (int rank = 0; rank < hits.length; rank++) {
      ScoreDoc hit = hits[rank];
      Document doc = searcher.doc(hit.doc);
      results.addAll(printHit(doc, hit.doc, hit.score, rank, null,
        query, showTokens, explain));
    }
    return results;
  }

  /**
   * Lists all records in the index, skipping deleted documents. If a field
   * list is supplied, then only the fields named in the list are requested
   * from printHit. If the list is null or blank, all the fields reported by
   * the reader are requested.
   * @param fieldList a space separated list of field names.
   * @return the formatted lines for every live document.
   * @throws CorruptIndexException
   * @throws IOException
   */
  public List<String> list(String fieldList)
      throws CorruptIndexException, IOException {
    validateIndexSet();
    validateOperationPossible();
    String requested = (fieldList == null) ? "" : fieldList.trim();
    List<String> results = new ArrayList<String>();
    IndexReader reader = null;
    try {
      reader = IndexReader.open(indexName);
      Set<String> fields = new HashSet<String>();
      if (requested.length() == 0) {
        fields.addAll(reader.getFieldNames(FieldOption.ALL));
      } else {
        fields.addAll(Arrays.asList(requested.split("\\s+")));
      }
      int numDocs = reader.maxDoc();
      // Result ids count the documents actually emitted. printHit emits the
      // horizontal header only for result id 0, so passing the docId here
      // would lose the header whenever document 0 has been deleted.
      int resultId = 0;
      for (int docId = 0; docId < numDocs; docId++) {
        if (reader.isDeleted(docId)) {
          continue;
        }
        Document doc = reader.document(docId);
        results.addAll(
          printHit(doc, docId, -1.0F, resultId, fields, null, false, false));
        resultId++;
      }
    } finally {
      closeReader(reader);
    }
    return results;
  }

  /**
   * Gets the document specified by docId from the current index.
   * @param docId the docId of the document to get, as typed by the user.
   * @return the formatted lines for the requested document.
   * @throws CorruptIndexException
   * @throws IOException
   */
  public List<String> getRecord(String docId) throws
      CorruptIndexException, IOException {
    validateIndexSet();
    validateOperationPossible();
    List<String> results = new ArrayList<String>();
    IndexReader reader = null;
    try {
      // Open inside the try block so the reader is also released when the
      // docId validation below rejects the input.
      reader = IndexReader.open(indexName);
      int docid = validateDocId(docId, reader.maxDoc());
      Document doc = reader.document(docid);
      results.addAll(printHit(doc, docid, -1.0F, 0, null, null, false, false));
    } finally {
      closeReader(reader);
    }
    return results;
  }

  /**
   * Open the current index with an IndexWriter, optimize it and commit. The
   * elapsed time is printed on success. An IOException raised while
   * optimizing is reported, including its message, rather than propagated;
   * the writer is closed in either case.
   * @throws IOException
   */
  public void optimize() throws IOException {
    validateIndexSet();
    validateOperationPossible();
    IndexWriter writer = null;
    try {
      writer = new IndexWriter(indexName, analyzer, false,
        MaxFieldLength.UNLIMITED);
      Lucli.message("Starting to optimize index.");
      long start = System.currentTimeMillis();
      writer.optimize();
      writer.commit();
      Lucli.message("Done optimizing index. Took " +
        (System.currentTimeMillis() - start) + " msecs");
    } catch (IOException e) {
      // Include the cause so the user can tell what actually went wrong.
      Lucli.error("IO Error, Optimization may have failed: " +
        e.getMessage());
    } finally {
      closeWriter(writer);
    }
  }

  /**
   * Deletes the document with the given docId from the current index and
   * prints a confirmation. The reader used for the deletion is handed to
   * closeReader whether or not the deletion succeeds.
   * @param docId the docId of the document to delete, as typed by the user.
   * @throws CorruptIndexException
   * @throws IOException
   */
  public void removeRecord(String docId) throws CorruptIndexException,
      IOException {
    validateIndexSet();
    validateOperationPossible();
    IndexReader indexReader = null;
    try {
      indexReader = IndexReader.open(indexName);
      int target = validateDocId(docId, indexReader.maxDoc());
      indexReader.deleteDocument(target);
      Lucli.message("Document " + docId + " deleted");
    } finally {
      closeReader(indexReader);
    }
  }

  /**
   * Prints the first Lucli.MAX_TERMS terms of the index, sorted by their
   * "field:text" key, each with its document frequency.
   * @param field the name of the field to restrict the listing to, or
   *        null/blank for all fields.
   * @throws CorruptIndexException
   * @throws IOException
   */
  public void terms(String field) throws CorruptIndexException, IOException {
    validateIndexSet();
    validateOperationPossible();
    String fieldFilter = (field == null) ? "" : field.trim();
    SortedMap<String,Integer> termMap = new TreeMap<String,Integer>();
    IndexReader reader = null;
    TermEnum terms = null;
    try {
      reader = IndexReader.open(indexName);
      terms = reader.terms();
      while (terms.next()) {
        Term term = terms.term();
        if (fieldFilter.length() == 0 || fieldFilter.equals(term.field())) {
          termMap.put(term.field() + ":" + term.text(),
            Integer.valueOf(terms.docFreq()));
        }
      }
      int printed = 0;
      for (Entry<String,Integer> entry : termMap.entrySet()) {
        // Check the limit before printing so exactly MAX_TERMS entries are
        // shown, not MAX_TERMS + 1.
        if (printed >= Lucli.MAX_TERMS) {
          break;
        }
        Lucli.message(entry.getKey() + ": " + entry.getValue());
        printed++;
      }
    } finally {
      // Release the term enumeration as well as the reader; the nested
      // finally guarantees the reader is handled even if the enum's close
      // fails.
      try {
        if (terms != null) {
          terms.close();
        }
      } finally {
        closeReader(reader);
      }
    }
  }

  /**
   * Opens a new index and returns a searcher for it. If the index name
   * contains a space it is treated as a space separated list of index names
   * and a MultiSearcher over all of them is created; otherwise a single
   * IndexSearcher is created. Information about the (first) index is
   * printed once it has been opened.
   * <p>
   * The multisearcher flag and, when requested, the searcher and indexName
   * fields are only changed when the whole operation succeeds. On failure
   * the flag is restored and any searcher opened so far is closed, so the
   * previously selected index stays usable.
   * @param indexName the name of the index, or several names separated by
   *        spaces.
   * @param setSearcher if true, sets the searcher with the new searcher.
   * @return a Searcher object representing the new index.
   * @throws CorruptIndexException
   * @throws IOException
   */
  private Searcher openIndex(String indexName, boolean setSearcher)
      throws CorruptIndexException, IOException {
    boolean previousMode = isMultisearcher;
    boolean multi = indexName.contains(" ");
    String[] indexNames =
      multi ? indexName.split(" ") : new String[] { indexName };
    Searchable[] searchables = new Searchable[indexNames.length];
    Searcher newSearcher = null;
    boolean opened = false;
    try {
      if (multi) {
        for (int i = 0; i < indexNames.length; i++) {
          searchables[i] = new IndexSearcher(indexNames[i]);
        }
        newSearcher = new MultiSearcher(searchables);
      } else {
        newSearcher = new IndexSearcher(indexName);
      }
      // info() reads the flag to decide whether to announce multisearcher
      // mode, so it has to be set before the call.
      isMultisearcher = multi;
      info(indexNames[0], newSearcher);
      opened = true;
    } finally {
      if (! opened) {
        // Roll back: keep the mode of the index that is still in use and
        // release whatever was opened for the failed attempt.
        isMultisearcher = previousMode;
        if (newSearcher != null) {
          try {
            newSearcher.close();
          } catch (IOException ignored) {
            // best effort cleanup, the original failure is what matters
          }
        } else {
          for (Searchable searchable : searchables) {
            if (searchable != null) {
              try {
                searchable.close();
              } catch (IOException ignored) {
                // best effort cleanup, the original failure is what matters
              }
            }
          }
        }
      }
    }
    if (setSearcher) {
      this.searcher = newSearcher;
      this.indexName = indexNames[0];
    }
    return newSearcher;
  }

  /**
   * Prints the document count reported by the searcher, followed by the
   * names of all fields and of the indexed fields of the named index, and
   * finally a note when multisearcher mode is active.
   * @param indexName the index name.
   * @param searcher the Searcher reference.
   * @throws IOException
   */
  private void info(String indexName, Searcher searcher) throws IOException {
    int docCount = searcher.maxDoc();
    Lucli.message("Index has " + docCount + " documents ");
    String allFields = getFieldsInfo(indexName, FieldOption.ALL);
    Lucli.message("All Fields: " + allFields);
    String indexedFields = getFieldsInfo(indexName, FieldOption.INDEXED);
    Lucli.message("Indexed Fields: " + indexedFields);
    if (! isMultisearcher) {
      return;
    }
    Lucli.message("Multisearcher mode set");
  }

  /**
   * Returns the field names in the index. A reader is opened for the
   * lookup and handed to closeReader afterwards, so repeated calls do not
   * accumulate open readers.
   * @param indexName the index name.
   * @param fieldOption a filter to show a subset of the field names.
   * @return a Stringified list of all field names.
   * @throws CorruptIndexException
   * @throws IOException
   */
  private String getFieldsInfo(String indexName, FieldOption fieldOption)
          throws CorruptIndexException, IOException {
    IndexReader reader = null;
    try {
      reader = IndexReader.open(indexName);
      Collection<String> fieldnames = reader.getFieldNames(fieldOption);
      return fieldnames.toString();
    } finally {
      closeReader(reader);
    }
  }

  /**
   * Convenience method to print out a Document into a List of formatted
   * strings. Behavior differs based on current orientation setting:
   * vertical orientation produces a separator line followed by one line per
   * field; horizontal orientation produces one semicolon delimited row per
   * document, preceded by a heading row when resultId is 0.
   * <p>
   * In both orientations only the fields named in fieldList are shown (all
   * fields when fieldList is null), the score is shown only when it is
   * positive, and each field shows its own value, so repeated fields of the
   * same name show their respective values.
   * @param doc the Document to print into a List.
   * @param docId the document id of the document.
   * @param score the score for the document.
   * @param resultId the result id for the current search results.
   * @param fieldList the names of the fields to show, or null for all
   * fields.
   * @param query the Query passed in, is null if called from 'list' or 'get'
   * commands.
   * @param showTokens true or false.
   * @param explainQuery true or false.
   * @return a List of Strings representing a single Document.
   * @throws IOException
   */
  private List<String> printHit(Document doc, int docId, float score,
      int resultId, Set<String> fieldList, Query query, boolean showTokens,
      boolean explainQuery) throws IOException {
    List<String> results = new ArrayList<String>();
    List<Field> fields = doc.getFields();
    // Apply the field filter once so heading, rows and the vertical layout
    // all agree on which fields are shown.
    List<Field> shown = new ArrayList<Field>();
    for (Field field : fields) {
      if (fieldList == null || fieldList.contains(field.name())) {
        shown.add(field);
      }
    }
    boolean hasScore = score > 0.0F;
    boolean withExplanation = explainQuery && query != null;
    if (orientVertical) {
      results.add(String.format(
        "-------------------- Result-#: %6d, DocId: %6d --------------------%n",
        resultId, docId));
      if (hasScore) {
        results.add(String.format("%-23s: %9.6f%n", "score", score));
      }
      for (Field field : shown) {
        // Flags: I(ndexed), T(okenized), S(tored), term V(ector) stored.
        results.add(String.format("%-16s (%s%s%s%s): %s%n", field.name(),
          (field.isIndexed() ? "I" : "-"),
          (field.isTokenized() ? "T" : "-"),
          (field.isStored() ? "S" : "-"),
          (field.isTermVectorStored() ? "V" : "-"), field.stringValue()));
      }
      if (showTokens) {
        results.add(String.format("%-23s: %s%n", "Tokens", invertDocument(doc)));
      }
      if (withExplanation) {
        results.add(String.format("%-23s: %s%n", "Explanation",
          explainQuery(query, docId)));
      }
    } else {
      StringBuilder buf;
      if (resultId == 0) {
        // if this is the first record, also print the heading
        buf = new StringBuilder();
        buf.append("##resultId;docId");
        if (hasScore) {
          buf.append(";score");
        }
        for (Field field : shown) {
          buf.append(";").append(field.name());
        }
        if (showTokens) {
          buf.append(";Tokens");
        }
        if (withExplanation) {
          buf.append(";Explanation");
        }
        results.add(buf.toString());
      }
      buf = new StringBuilder();
      buf.append(resultId).append(";").append(docId);
      if (hasScore) {
        // The delimiter keeps the score in its own column, matching the
        // ";score" entry of the heading.
        buf.append(";").append(String.format("%9.6f", score));
      }
      for (Field field : shown) {
        buf.append(";").append(field.stringValue());
      }
      if (showTokens) {
        buf.append(";").append(invertDocument(doc));
      }
      if (withExplanation) {
        // keep the whole record on one line
        buf.append(";").
            append(explainQuery(query, docId).replaceAll("\n", "<br/>"));
      }
      results.add(buf.toString());
    }
    return results;
  }

  /**
   * Asks the current searcher to explain how the given document scores
   * against the query and returns the explanation's string form.
   * @param query the query to analyze.
   * @param docId the docId to generate the explanation for.
   * @return the explanation rendered with toString().
   * @throws IOException
   */
  private String explainQuery(Query query, int docId) throws IOException {
    validateIndexSet();
    final String explanation = searcher.explain(query, docId).toString();
    return explanation;
  }

  /**
   * Turns a query string into a Query using a QueryParser built from
   * Lucli's default field name and the currently configured analyzer.
   * @param queryString the Lucene query string to parse.
   * @return the parsed Query.
   * @throws java.io.IOException
   * @throws org.apache.lucene.queryParser.ParseException
   */
  private Query parse(String queryString) throws IOException, ParseException {
    return new QueryParser(Lucli.DEFAULT_FIELDNAME, analyzer)
        .parse(queryString);
  }

  /**
   * Searches using the passed in query. Unlike the original Lucli, the
   * assumption here is that the user knows Lucene's query syntax, so the
   * caller hands in an already parsed, fully formed Query. The number of
   * hits requested equals the searcher's maxDoc(), so no match is cut off.
   * @param query the parsed Lucene query to execute.
   * @return an array of ScoreDoc objects; empty if the searcher reports no
   * documents.
   * @throws IOException if one is thrown by underlying Lucene index.
   * @throws IllegalArgumentException if no index/searcher has been set.
   */
  private ScoreDoc[] initSearch(Query query) throws IOException {
    validateIndexSet();
    int maxDocs = searcher.maxDoc();
    if (maxDocs <= 0) {
      // Lucene's top-N search requires a positive hit count and rejects 0
      // with an IllegalArgumentException; an empty index just has no hits.
      return new ScoreDoc[0];
    }
    TopDocs topDocs = searcher.search(query, null, maxDocs);
    return topDocs.scoreDocs;
  }

  /**
   * Re-analyzes every indexed and tokenized field of the document with the
   * current analyzer, tallies how often each term occurs across those fields,
   * and renders the tally as a comma-separated list of term(count) pairs in
   * decreasing frequency order.
   * @param doc the Document to analyze.
   * @return the term(count) list; an empty string if no tokens were produced.
   * @throws IOException if one is thrown while reading or closing a
   * token stream.
   */
  private String invertDocument(Document doc) throws IOException {
    Map<String,Integer> termFreqs = new HashMap<String,Integer>();
    for (Object fieldObj : doc.getFields()) {
      Field field = (Field) fieldObj;
      if (!(field.isIndexed() && field.isTokenized())) {
        // only fields that went through analysis are of interest
        continue;
      }
      // prefer the field's Reader value, fall back to its String value
      Reader source = field.readerValue();
      if (source == null) {
        String text = field.stringValue();
        if (text == null) {
          // neither a Reader nor a String: nothing to tokenize
          continue;
        }
        source = new StringReader(text);
      }
      TokenStream tokens = analyzer.tokenStream(field.name(), source);
      try {
        // each returned Token is handed back to next() for reuse
        Token reusable = new Token();
        for (Token tok = tokens.next(reusable); tok != null;
            tok = tokens.next(tok)) {
          String term = tok.term();
          Integer seen = termFreqs.get(term);
          termFreqs.put(term, seen == null ? 1 : seen + 1);
        }
      } finally {
        tokens.close();
      }
    }
    List<Entry<String, Integer>> ranked =
      new ArrayList<Entry<String, Integer>>(termFreqs.entrySet());
    Collections.sort(ranked, new Comparator<Entry<String, Integer>>() {
      public int compare(Entry<String, Integer> a, Entry<String, Integer> b) {
        // operands swapped on purpose: highest count sorts first
        return b.getValue().compareTo(a.getValue());
      }
    });
    StringBuilder out = new StringBuilder();
    String separator = "";
    for (Entry<String, Integer> ranking : ranked) {
      out.append(separator);
      out.append(ranking.getKey()).append("(").
              append(ranking.getValue()).append(")");
      separator = ",";
    }
    return out.toString();
  }

  /**
   * Best-effort close of an IndexReader: a null reader is ignored and any
   * exception raised by close() is discarded.
   * @param reader the IndexReader to close, may be null.
   */
  private void closeReader(IndexReader reader) {
    if (reader == null) {
      return;
    }
    try {
      reader.close();
    } catch (Exception ignored) {
      // deliberately dropped, not much we can do at this stage
    }
  }

  /**
   * Best-effort close of an IndexWriter: a null writer is ignored and any
   * exception raised by close() is discarded.
   * @param writer the IndexWriter to close, may be null.
   */
  private void closeWriter(IndexWriter writer) {
    if (writer == null) {
      return;
    }
    try {
      writer.close();
    } catch (Exception ignored) {
      // deliberately dropped, not much we can do at this stage
    }
  }

  /**
   * Guard for operations that need an index: verifies that both the
   * indexName and searcher fields have been set.
   * @throws IllegalArgumentException carrying the help text of
   * LucliCommand.INDEX if either of the two is null.
   */
  private void validateIndexSet() {
    boolean indexReady = (indexName != null) && (searcher != null);
    if (!indexReady) {
      throw new IllegalArgumentException(LucliCommand.INDEX.help);
    }
  }

  /**
   * Checks that the docId supplied is numeric and within range, else throws
   * an IllegalArgumentException.
   * @param docId the docId to look up or remove, as typed by the user.
   * @param numDocs the number of documents in the Index.
   * @return the docId as an int, guaranteed to lie in [0..numDocs-1].
   * @throws IllegalArgumentException if docId cannot be parsed as an integer
   * (the NumberFormatException is attached as the cause) or if it falls
   * outside [0..numDocs-1].
   */
  private int validateDocId(String docId, int numDocs) {
    int docid;
    try {
      docid = Integer.parseInt(docId);
    } catch (NumberFormatException e) {
      // chain the parse failure so the offending input stays diagnosable
      throw new IllegalArgumentException("docId must be numeric", e);
    }
    if ((docid < 0) || (docid > numDocs - 1)) {
      throw new IllegalArgumentException(
        "DocId must be in range:[0.." + (numDocs - 1) + "]");
    }
    return docid;
  }

  /**
   * Rejects operations that cannot be carried out while the
   * isMultisearcher flag is set.
   * @throws IllegalArgumentException if the isMultisearcher flag is set.
   */
  private void validateOperationPossible() {
    if (!isMultisearcher) {
      return;
    }
    throw new IllegalArgumentException(
      "Impossible operation, 'index' to reset to single searcher mode");
  }
}