View Javadoc
1   package org.wikimedia.search.extra.analysis.filters;
2   
3   import java.io.IOException;
4   
5   import javax.annotation.Nullable;
6   
7   import org.apache.lucene.analysis.TokenFilter;
8   import org.apache.lucene.analysis.TokenStream;
9   import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
10  import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter;
11  import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
12  import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;
13  import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
14  import org.apache.lucene.analysis.util.TokenFilterFactory;
15  
16  import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
17  
18  /**
19   * A token filter that wraps another one to preserve original terms at the same position.
20   * The purpose is very similar to {@link KeywordRepeatFilter}+{@link RemoveDuplicatesTokenFilter}
21   * but this approach does not require that the filters support the keyword attribute.
22   */
23  // TODO: check if the behaviour of equals() is actually what is expected. Read
24  // https://sourceforge.net/p/findbugs/bugs/1379/ before blindly adding an
25  // equals() method to PreserveOriginalFilter.
26  @SuppressFBWarnings(
27          value = "EQ_DOESNT_OVERRIDE_EQUALS",
28          justification = "equals() as defined in org.apache.lucene.util.AttributeSource seems strong enough.")
29  public class PreserveOriginalFilter extends TokenFilter {
30      private final CharTermAttribute cattr;
31      private final PositionIncrementAttribute posIncr;
32      private final OriginalTermAttribute original;
33      @Nullable private State preserve;
34  
35      /**
36       * Builds a new PreserveOriginalFilter, the input TokenStream must be filtered by a PreserveOriginalFilter.Recorder.
37       *
38       * @param input input
39       * @throws IllegalArgumentException if the analysis chain does not contain an OriginalTermAttribute
40       */
41      public PreserveOriginalFilter(TokenStream input) {
42          super(input);
43          cattr = getAttribute(CharTermAttribute.class);
44          posIncr = addAttribute(PositionIncrementAttribute.class);
45          original = getAttribute(OriginalTermAttribute.class);
46          if (original == null) {
47              throw new IllegalArgumentException("PreserveOriginalFilter must be used with a PreserveOriginalFilter.Recorder fitler in the same analysis chain.");
48          }
49      }
50  
51      /**
52       * Constructor using lucene factory classes.
53       *
54       * @param input original input stream
55       * @param wrapped token filter we want to wrap
56       */
57      public PreserveOriginalFilter(TokenStream input, TokenFilterFactory wrapped) {
58          this(wrapped.create(new Recorder(input)));
59      }
60  
61      @Override
62      public final boolean incrementToken() throws IOException {
63          if (preserve != null) {
64              restoreState(preserve);
65              cattr.copyBuffer(original.buffer(), 0, original.length());
66              posIncr.setPositionIncrement(0);
67              preserve = null;
68              return true;
69          }
70  
71          if (input.incrementToken()) {
72              if (!original.equals(cattr)) {
73                  preserve = captureState();
74              }
75              return true;
76          } else {
77              return false;
78          }
79      }
80  
81      /**
82       * A simple filter that records a copy of the current token in the OriginalTermAttribute attribute.
83       */
84      public static class Recorder extends TokenFilter {
85          private final OriginalTermAttribute original = this.addAttribute(OriginalTermAttribute.class);
86          private final CharTermAttribute cattr = this.addAttribute(CharTermAttribute.class);
87          public Recorder(TokenStream input) {
88              super(input);
89          }
90  
91          /* (non-Javadoc)
92           * @see org.apache.lucene.analysis.TokenStream#incrementToken()
93           */
94          @Override
95          public final boolean incrementToken() throws IOException {
96              if (input.incrementToken()) {
97                  original.copyBuffer(cattr.buffer(), 0, cattr.length());
98                  return true;
99              }
100             return false;
101         }
102     }
103 
104     /**
105      * A copy of {@link CharTermAttribute} taken by {@link Recorder}.
106      * This copy is restored by {@link PreserveOriginalFilter} at the same position if
107      * the token is different.
108      */
109     public interface OriginalTermAttribute extends CharTermAttribute {}
110 
111     /* (non-Javadoc)
112      * @see org.apache.lucene.analysis.attributes.CharTermAttributeImpl
113      *
114      * Everything we need is already implemented by CharTermAttributeImpl. But
115      * the way attributes work makes it impossible to reuse existing
116      * implementations for new attributes without defining a new
117      * Interface/InterfaceImpl pair.
118      */
119     public static class OriginalTermAttributeImpl extends CharTermAttributeImpl implements OriginalTermAttribute {}
120 }