001    /**   
002     * Copyright 2011 The Buzz Media, LLC
003     * 
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     *
008     *   http://www.apache.org/licenses/LICENSE-2.0
009     *
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     */
016    package com.thebuzzmedia.sjxp;
017    
018    import java.io.IOException;
019    import java.io.InputStream;
020    import java.io.UnsupportedEncodingException;
021    import java.nio.charset.Charset;
022    import java.util.ArrayList;
023    import java.util.HashMap;
024    import java.util.List;
025    import java.util.Map;
026    
027    import org.xmlpull.v1.XmlPullParser;
028    import org.xmlpull.v1.XmlPullParserException;
029    import org.xmlpull.v1.XmlPullParserFactory;
030    
031    import com.thebuzzmedia.sjxp.rule.IRule;
032    import com.thebuzzmedia.sjxp.rule.IRule.Type;
033    
034    /**
035     * Class used to define a parser that combines the performance of an XML Pull
036     * Parser with the ease of use of XPath-like expressions.
037     * 
038     * <h3>Thread Safety</h3> This class is not thread-safe, however instances of
039     * {@link XMLParser} can safely be re-used to parse multiple files once the
040     * previous parse operation is done.
041     * 
042     * @param <T>
043     *            The class type of any user-supplied object that the caller wishes
044     *            to be passed through from one of the {@link XMLParser}'s
045     *            <code>parse</code> methods directly to the handler when an
046     *            {@link IRule} matches. This is typically a data storage mechanism
047     *            like a DAO or cache used to store the parsed value in some
048     *            valuable way, but it can ultimately be anything. If you do not
049     *            need to make use of the user object, there is no need to
050     *            parameterize the class.
051     * 
052     * @author Riyad Kalla (software@thebuzzmedia.com)
053     */
054    public class XMLParser<T> {
055            /**
056             * Flag used to indicate if debugging output has been enabled by setting the
057             * "sjxp.debug" system property to <code>true</code>. This value will be
058             * <code>false</code> if the "sjxp.debug" system property is undefined or
059             * set to <code>false</code>.
060             * <p/>
061             * This system property can be set on startup with:<br/>
062             * <code>
063             * -Dsjxp.debug=true
064             * </code> or by calling {@link System#setProperty(String, String)} before
065             * this class is loaded.
066             * <p/>
067             * This is <code>false</code> by default.
068             */
069            public static final Boolean DEBUG = Boolean.getBoolean("sjxp.debug");
070    
071            /**
072             * Flag used to indicate if this parser should be namespace-aware by setting
073             * the "sjxp.namespaces" system property to <code>true</code>. This value
074             * will be <code>true</code> if the "sjxp.namespaces" system property is
075             * undefined. Namespace awareness can only be disabled by setting this
076             * system property to <code>false</code>.
077             * <p/>
078             * <strong>NOTE</strong>: If you intentionally disable namespace awareness,
079             * any {@link IRule} you provide that uses namespace qualified values (e.g.
080             * [http://w3.org/text]book) will fail to match as the parser can no longer
081             * see namespace URIs.
082             * <p/>
083             * This system property can be set on startup with:<br/>
084             * <code>
085             * -Dsjxp.namespaces=true
086             * </code> or by calling {@link System#setProperty(String, String)} before
087             * this class is loaded.
088             * <p/>
089             * This is <code>true</code> by default.
090             */
091            public static final Boolean ENABLE_NAMESPACES = (System
092                            .getProperty("sjxp.namespaces") == null ? Boolean.TRUE : Boolean
093                            .getBoolean("sjxp.namespaces"));
094    
095            /**
096             * Flag used to indicate if this parser should validate the parsed XML
097             * against the referenced DTD or XML Schema by setting the "sjxp.validation"
098             * system property to <code>true</code>. This value will be
099             * <code>false</code> if the "sjxp.validation" system property is undefined
100             * or set to <code>false</code>.
101             * <p/>
102             * This system property can be set on startup with:<br/>
103             * <code>
104             * -Dsjxp.validation=true
105             * </code> or by calling {@link System#setProperty(String, String)} before
106             * this class is loaded.
107             * <p/>
108             * This is <code>false</code> by default.
109             */
110            public static final Boolean ENABLE_VALIDATION = Boolean
111                            .getBoolean("sjxp.validation");
112    
113            /**
114             * Prefix to every log message this library logs. Using a well-defined
115             * prefix helps make it easier both visually and programmatically to scan
116             * log files for messages produced by this library.
117             * <p/>
118             * The value is "[sjxp] " (including the space).
119             */
120            public static final String LOG_MESSAGE_PREFIX = "[sjxp] ";
121    
122            /**
123             * Singleton {@link XmlPullParserFactory} instance used to create new
124             * underlying {@link XmlPullParser} instances for each instance of
125             * {@link XMLParser}.
126             */
127            public static final XmlPullParserFactory XPP_FACTORY;
128    
129            /**
130             * Static initializer used to init the {@link XmlPullParserFactory} with the
131             * configured namespace and validation settings.
132             */
133            static {
134                    if (DEBUG)
135                            log("Debug output ENABLED");
136    
137                    try {
138                            XPP_FACTORY = XmlPullParserFactory.newInstance();
139    
140                            // Configure pull parser features
141                            XPP_FACTORY.setFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES,
142                                            ENABLE_NAMESPACES);
143                            XPP_FACTORY.setFeature(XmlPullParser.FEATURE_VALIDATION,
144                                            ENABLE_VALIDATION);
145    
146                            if (DEBUG)
147                                    log("XmlPullParserFactory configured [namespaces=%s, validation=%s]",
148                                                    ENABLE_NAMESPACES, ENABLE_VALIDATION);
149                    } catch (XmlPullParserException e) {
150                            throw new RuntimeException(
151                                            "An exception occurred while calling XmlPullParserFactory.newInstance(). A library providing the impl of the XML Pull Parser spec (e.g. XPP3 or Android SDK) must be available at runtime.",
152                                            e);
153                    }
154            }
155    
156            /**
157             * Helper method used to ensure a message is loggable before it is logged
158             * and then pre-pend a universal prefix to all log messages generated by
159             * this library to make the log entries easy to parse visually or
160             * programmatically.
161             * <p/>
162             * If a message cannot be logged (logging is disabled) then this method
163             * returns immediately.
164             * <p/>
165             * <strong>NOTE</strong>: Because Java will auto-box primitive arguments
166             * into Objects when building out the <code>params</code> array, care should
167             * be taken not to call this method with primitive values unless
168             * {@link #DEBUG} is <code>true</code>; otherwise the VM will be spending
169             * time performing unnecessary auto-boxing calculations.
170             * 
171             * @param message
172             *            The log message in <a href=
173             *            "http://download.oracle.com/javase/6/docs/api/java/util/Formatter.html#syntax"
174             *            >format string syntax</a> that will be logged.
175             * @param params
176             *            The parameters that will be swapped into all the place holders
177             *            in the original messages before being logged.
178             * 
179             * @see #LOG_MESSAGE_PREFIX
180             */
181            protected static void log(String message, Object... params) {
182                    if (DEBUG)
183                            System.out.printf(LOG_MESSAGE_PREFIX + message + '\n', params);
184            }
185    
186            private String toStringCache;
187            private boolean continueParsing = true;
188    
189            private Location location;
190            private XmlPullParser xpp;
191    
192            private Map<Integer, List<IRule<T>>> tagRuleMap;
193            private Map<Integer, List<IRule<T>>> attrRuleMap;
194            private Map<Integer, List<IRule<T>>> charRuleMap;
195    
196            /**
197             * Create a new parser that uses the given {@link IRule}s when parsing any
198             * XML content.
199             * 
200             * @param rules
201             *            The rules applied to any parsed content.
202             * 
203             * @throws IllegalArgumentException
204             *             if <code>rules</code> is <code>null</code> or empty.
205             * @throws XMLParserException
206             *             if the {@link #XPP_FACTORY} is unable to create a new
207             *             {@link XmlPullParser} instance and throws an exception.
208             */
209            public XMLParser(IRule<T>... rules) throws IllegalArgumentException,
210                            XMLParserException {
211                    if (rules == null || rules.length == 0)
212                            throw new IllegalArgumentException(
213                                            "rules cannot be null or empty, you must provide at least 1 rule to execute otherwise parsing will do nothing.");
214    
215                    location = new Location();
216    
217                    try {
218                            xpp = XPP_FACTORY.newPullParser();
219                    } catch (XmlPullParserException e) {
220                            throw new XMLParserException(
221                                            "An exception occurred while trying to create a new XmlPullParser instance using the XmlPullParserFactory.",
222                                            e);
223                    }
224    
225                    // Load all the rules
226                    initRules(rules);
227            }
228    
229            /**
230             * Overridden to provide a nicely formatted representation of the parser for
231             * easy debugging.
232             * <p/>
233             * As an added bonus, since {@link XMLParser}s are intended to be immutable,
234             * the result of <code>toString</code> is cached on the first call and the
235             * cache returned every time to avoid re-computing the completed
236             * {@link String}.
237             * 
238             * @return a nicely formatted representation of the parser for easy
239             *         debugging.
240             */
241            @Override
242            public synchronized String toString() {
243                    if (toStringCache == null) {
244                            toStringCache = this.getClass().getName() + "[attributeRules="
245                                            + attrRuleMap + ", characterRules=" + charRuleMap + "]";
246                    }
247    
248                    return toStringCache;
249            }
250    
251            /**
252             * Used to indicate to the parser that you would like it to stop parsing.
253             * <p/>
254             * Internally the parser uses a simple <code>boolean</code> to indicate if
255             * it should keep parsing. A call to this method sets the boolean value to
256             * <code>false</code> which the parser checks at the next parse event and
257             * then stops.
258             * <p/>
259             * This is a safe operation that simply flips a flag to tell the underlying
260             * {@link XmlPullParser} to stop working after it's done with its current
261             * parse event and return from whichever <code>parse</code> method was
262             * called.
263             */
264            public void stop() {
265                    continueParsing = false;
266            }
267    
268            /**
269             * Parse the XML out of the given stream matching the {@link IRule}s
270             * provided when the {@link XMLParser} was instantiated.
271             * <p/>
272             * The underlying {@link XmlPullParser} will attempt to determine the
273             * stream's encoding based on the pull parser spec or fall back to a default
274             * of UTF-8.
275             * <p/>
276             * This class will make no attempt at closing the given {@link InputStream},
277             * the caller must take care to clean up that resource.
278             * <h3>Stopping Parsing</h3>
279             * Parsing can be safely stopped by calling {@link #stop()}. This allows
280             * {@link IRule} implementations control over stopping parsing, for example,
281             * if an arbitrary threshold is hit. A followup call to any of the
282             * <code>parse</code> methods will reset the stopped state.
283             * 
284             * @param source
285             *            The stream that XML content will be read out of.
286             * 
287             * @throws IllegalArgumentException
288             *             if <code>source</code> is <code>null</code>.
289             * @throws XMLParserException
290             *             if any error occurs with the underlying stream during parsing
291             *             or if the XML content itself is malformed and the underlying
292             *             pull parser cannot parse it.
293             */
294            public void parse(InputStream source) throws IllegalArgumentException,
295                            XMLParserException {
296                    try {
297                            parse(source, null, null);
298                    } catch (UnsupportedEncodingException e) {
299                            // no-op, this should never happen as null is a valid encoding.
300                    }
301            }
302    
303            /**
304             * Parse the XML out of the given stream matching the {@link IRule}s
305             * provided when the {@link XMLParser} was instantiated.
306             * <p/>
307             * The underlying {@link XmlPullParser} will attempt to determine the
308             * stream's encoding based on the pull parser spec or fall back to a default
309             * of UTF-8.
310             * <p/>
311             * This class will make no attempt at closing the given {@link InputStream},
312             * the caller must take care to clean up that resource.
313             * <h3>Stopping Parsing</h3>
314             * Parsing can be safely stopped by calling {@link #stop()}. This allows
315             * {@link IRule} implementations control over stopping parsing, for example,
316             * if an arbitrary threshold is hit. A followup call to any of the
317             * <code>parse</code> methods will reset the stopped state.
318             * 
319             * @param source
320             *            The stream that XML content will be read out of.
321             * @param userObject
322             *            The user-supplied object passed through from this parse method
323             *            to the matching {@link IRule}'s <code>handleXXX</code> method
324             *            when a match is found, or <code>null</code> if no user object
325             *            is needed. Passing through a user-object is just meant as a
326             *            convenience for giving the handler methods on the
327             *            {@link IRule}'s access to objects like DAOs that can be used
328             *            to persist or process parsed data easily.
329             * 
330             * @throws IllegalArgumentException
331             *             if <code>source</code> is <code>null</code>.
332             * @throws XMLParserException
333             *             if any error occurs with the underlying stream during parsing
334             *             or if the XML content itself is malformed and the underlying
335             *             pull parser cannot parse it.
336             */
337            public void parse(InputStream source, T userObject)
338                            throws IllegalArgumentException, XMLParserException {
339                    try {
340                            parse(source, null, userObject);
341                    } catch (UnsupportedEncodingException e) {
342                            // no-op, this should never happen as null is a valid encoding.
343                    }
344            }
345    
346            /**
347             * Parse the XML out of the given stream (producing content matching the
348             * given encoding) matching the {@link IRule}s provided when the
349             * {@link XMLParser} was instantiated.
350             * <p/>
351             * This class will make no attempt at closing the given {@link InputStream},
352             * the caller must take care to clean up that resource.
353             * <h3>Stopping Parsing</h3>
354             * Parsing can be safely stopped by calling {@link #stop()}. This allows
355             * {@link IRule} implementations control over stopping parsing, for example,
356             * if an arbitrary threshold is hit. A followup call to any of the
357             * <code>parse</code> methods will reset the stopped state.
358             * 
359             * @param source
360             *            The stream that XML content will be read out of.
361             * @param encoding
362             *            The character encoding (e.g. "UTF-8") of the data from the
363             *            given stream. If the encoding is not known, passing
364             *            <code>null</code> or calling {@link #parse(InputStream)}
365             *            instead will allow the underlying {@link XmlPullParser} to try
366             *            and automatically determine the encoding.
367             * 
368             * @throws IllegalArgumentException
369             *             if <code>source</code> is <code>null</code>.
370             * @throws UnsupportedEncodingException
371             *             if <code>encoding</code> represents an encoding name that is
372             *             not recognized by {@link Charset#isSupported(String)}
373             * @throws XMLParserException
374             *             if any error occurs with the underlying stream during parsing
375             *             or if the XML content itself is malformed and the underlying
376             *             pull parser cannot parse it.
377             */
378            public void parse(InputStream source, String encoding)
379                            throws IllegalArgumentException, UnsupportedEncodingException,
380                            XMLParserException {
381                    parse(source, encoding, null);
382            }
383    
384            /**
385             * Parse the XML out of the given stream (producing content matching the
386             * given encoding) matching the {@link IRule}s provided when the
387             * {@link XMLParser} was instantiated.
388             * <p/>
389             * This class will make no attempt at closing the given {@link InputStream},
390             * the caller must take care to clean up that resource.
391             * <h3>Stopping Parsing</h3>
392             * Parsing can be safely stopped by calling {@link #stop()}. This allows
393             * {@link IRule} implementations control over stopping parsing, for example,
394             * if an arbitrary threshold is hit. A followup call to any of the
395             * <code>parse</code> methods will reset the stopped state.
396             * 
397             * @param source
398             *            The stream that XML content will be read out of.
399             * @param encoding
400             *            The character encoding (e.g. "UTF-8") of the data from the
401             *            given stream. If the encoding is not known, passing
402             *            <code>null</code> or calling {@link #parse(InputStream)}
403             *            instead will allow the underlying {@link XmlPullParser} to try
404             *            and automatically determine the encoding.
405             * @param userObject
406             *            The user-supplied object passed through from this parse method
407             *            to the matching {@link IRule}'s <code>handleXXX</code> method
408             *            when a match is found, or <code>null</code> if no user object
409             *            is needed. Passing through a user-object is just meant as a
410             *            convenience for giving the handler methods on the
411             *            {@link IRule}'s access to objects like DAOs that can be used
412             *            to persist or process parsed data easily.
413             * 
414             * @throws IllegalArgumentException
415             *             if <code>source</code> is <code>null</code>.
416             * @throws UnsupportedEncodingException
417             *             if <code>encoding</code> represents an encoding name that is
418             *             not recognized by {@link Charset#isSupported(String)}
419             * @throws XMLParserException
420             *             if any error occurs with the underlying stream during parsing
421             *             or if the XML content itself is malformed and the underlying
422             *             pull parser cannot parse it.
423             */
424            public void parse(InputStream source, String encoding, T userObject)
425                            throws IllegalArgumentException, UnsupportedEncodingException,
426                            XMLParserException {
427                    if (source == null)
428                            throw new IllegalArgumentException("source cannot be null");
429                    if (encoding != null) {
430                            // If empty, ensure it is null so XPP gets encoding from XML header
431                            if (encoding.trim().length() == 0)
432                                    encoding = null;
433                            // Extra-safe, make sure the provided encoding is valid
434                            else if (!Charset.isSupported(encoding))
435                                    throw new UnsupportedEncodingException(
436                                                    "Encoding ["
437                                                                    + encoding
438                                                                    + "] is not a valid charset encoding in this runtime according to Charset.isSupported(encoding).");
439                    }
440    
441                    try {
442                            xpp.setInput(source, encoding);
443    
444                            if (DEBUG)
445                                    log("Underlying XmlPullParser input set [type=InputStream, encoding=%s (null is OK), userObject=%s]",
446                                                    xpp.getInputEncoding(), (userObject == null ? ""
447                                                                    : userObject));
448                    } catch (XmlPullParserException e) {
449                            throw new XMLParserException(
450                                            "Unable to set the given InputStream (with an optional encoding of '"
451                                                            + encoding
452                                                            + "') as input for the underlying XmlPullParser.",
453                                            e);
454                    }
455    
456                    try {
457                            doParse(userObject);
458                    } catch (IOException e) {
459                            throw new XMLParserException(
460                                            "An exception occurred while parsing the given source, the XML document may be malformed.",
461                                            e);
462                    } catch (XmlPullParserException e) {
463                            throw new XMLParserException(
464                                            "An error with the underlying data stream being parsed occurred.",
465                                            e);
466                    }
467            }
468    
469            protected void initRules(IRule<T>... rules) {
470                    // calculate a rough optimal size for the rule maps
471                    int optSize = (rules.length > 64 ? rules.length * 2 : 64);
472    
473                    // init the rule maps
474                    tagRuleMap = new HashMap<Integer, List<IRule<T>>>(optSize);
475                    attrRuleMap = new HashMap<Integer, List<IRule<T>>>(optSize);
476                    charRuleMap = new HashMap<Integer, List<IRule<T>>>(optSize);
477    
478                    // init the rules
479                    List<IRule<T>> ruleList = null;
480    
481                    for (int i = 0, length = rules.length; i < length; i++) {
482                            IRule<T> rule = rules[i];
483    
484                            switch (rule.getType()) {
485                            case TAG:
486                                    // Get the rule list for this path
487                                    ruleList = tagRuleMap.get(rule.getLocationPath());
488    
489                                    // If there wasn't already a rule list, create and add it
490                                    if (ruleList == null) {
491                                            ruleList = new ArrayList<IRule<T>>(3);
492                                            tagRuleMap.put(rule.getLocationPath().hashCode(), ruleList);
493                                    }
494                                    break;
495    
496                            case ATTRIBUTE:
497                                    // Get the rule list for this path
498                                    ruleList = attrRuleMap.get(rule.getLocationPath());
499    
500                                    // If there wasn't already a rule list, create and add it
501                                    if (ruleList == null) {
502                                            ruleList = new ArrayList<IRule<T>>(3);
503                                            attrRuleMap
504                                                            .put(rule.getLocationPath().hashCode(), ruleList);
505                                    }
506                                    break;
507    
508                            case CHARACTER:
509                                    // Get the rule list for this path
510                                    ruleList = charRuleMap.get(rule.getLocationPath());
511    
512                                    // If there wasn't already a rule list, create and add it
513                                    if (ruleList == null) {
514                                            ruleList = new ArrayList<IRule<T>>(3);
515                                            charRuleMap
516                                                            .put(rule.getLocationPath().hashCode(), ruleList);
517                                    }
518                                    break;
519                            }
520    
521                            // Add the rule to the list for the given path
522                            ruleList.add(rule);
523                    }
524    
525                    if (DEBUG)
526                            log("Initialized %d TAG rules, %d ATTRIBUTE rules and %d CHARACTER rules.",
527                                            tagRuleMap.size(), attrRuleMap.size(), charRuleMap.size());
528            }
529    
530            /**
531             * Uses the underlying {@link XmlPullParser} to begin parsing through the
532             * XML content from the given stream. This method's implementation is
533             * simple, acting like a traffic-cop responding to
534             * {@link XmlPullParser#START_TAG}, {@link XmlPullParser#TEXT},
535             * {@link XmlPullParser#END_TAG} and {@link XmlPullParser#END_DOCUMENT}
536             * events by calling the appropriate <code>doXXX</code> methods.
537             * <p/>
538             * Developers creating a subclass of {@link XMLParser} are meant to override
539             * one of the {@link #doStartTag(Object)}, {@link #doText(Object)},
540             * {@link #doEndTag(Object)} and {@link #doEndDocument(Object)} methods to
541             * add custom behavior and not necessarily override this central method.
542             * <h3>Stopping Parsing</h3>
543             * Parsing can be safely stopped by calling {@link #stop()}. This allows
544             * {@link IRule} implementations control over stopping parsing, for example,
545             * if an arbitrary threshold is hit. A followup call to any of the
546             * <code>parse</code> methods will reset the stopped state.
547             * 
548             * @param userObject
549             *            The user-supplied object passed through from this parse method
550             *            to the matching {@link IRule}'s <code>handleXXX</code> method
551             *            when a match is found, or <code>null</code> if no user object
552             *            is needed. Passing through a user-object is just meant as a
553             *            convenience for giving the handler methods on the
554             *            {@link IRule}'s access to objects like DAOs that can be used
555             *            to persist or process parsed data easily.
556             * 
557             * @throws IOException
558             *             if an error occurs with reading from the underlying
559             *             {@link InputStream} given to one of the public
560             *             <code>parse</code> methods.
561             * @throws XmlPullParserException
562             *             if an error occurs while parsing the XML content from the
563             *             underlying stream; typically resulting from malformed or
564             *             invalid XML.
565             */
566            protected void doParse(T userObject) throws IOException,
567                            XmlPullParserException {
568                    location.clear();
569                    continueParsing = true;
570    
571                    if (DEBUG)
572                            log("Parsing starting...");
573    
574                    long startTime = System.currentTimeMillis();
575    
576                    while (continueParsing) {
577                            switch (xpp.next()) {
578                            case XmlPullParser.START_TAG:
579                                    doStartTag(userObject);
580                                    break;
581    
582                            case XmlPullParser.TEXT:
583                                    doText(userObject);
584                                    break;
585    
586                            case XmlPullParser.END_TAG:
587                                    doEndTag(userObject);
588                                    break;
589    
590                            case XmlPullParser.END_DOCUMENT:
591                                    continueParsing = false;
592                                    doEndDocument(userObject);
593                                    break;
594                            }
595                    }
596    
597                    if (DEBUG) {
598                            long duration = System.currentTimeMillis() - startTime;
599                            log("Parse COMPLETE, elapsed time: %dms (approx %f seconds)",
600                                            duration, (double) duration / (double) 1000);
601                    }
602            }
603    
604            /**
605             * Used to process a {@link XmlPullParser#START_TAG} event.
606             * <p/>
607             * By default this updates the internal location state of the parser,
608             * processes all {@link IRule}s of type {@link Type#TAG} and processes all
609             * {@link IRule}s of type {@link Type#ATTRIBUTE} that match the parser's
610             * current location.
611             * 
612             * @param userObject
613             *            The user-supplied object passed through from this parse method
614             *            to the matching {@link IRule}'s <code>handleXXX</code> method
615             *            when a match is found, or <code>null</code> if no user object
616             *            is needed. Passing through a user-object is just meant as a
617             *            convenience for giving the handler methods on the
618             *            {@link IRule}'s access to objects like DAOs that can be used
619             *            to persist or process parsed data easily.
620             */
621            protected void doStartTag(T userObject) {
622                    // Update parser location
623                    location.push(xpp.getName(), xpp.getNamespace());
624    
625                    if (DEBUG)
626                            log("START_TAG: %s", location);
627    
628                    // Get the rules for the current path
629                    List<IRule<T>> tagRuleList = tagRuleMap.get(location
630                                    .getCachedHashCode());
631                    List<IRule<T>> attrRuleList = attrRuleMap.get(location
632                                    .getCachedHashCode());
633    
634                    // If there are no rules for the current path, then we are done.
635                    if ((tagRuleList == null || tagRuleList.isEmpty())
636                                    && (attrRuleList == null || attrRuleList.isEmpty()))
637                            return;
638    
639                    if (DEBUG)
640                            log("\t%d TAG rules and %d ATTR rules found for START_TAG...",
641                                            (tagRuleList == null ? 0 : tagRuleList.size()),
642                                            (attrRuleList == null ? 0 : attrRuleList.size()));
643    
644                    // Process the TAG rules
645                    if (tagRuleList != null) {
646                            for (int i = 0, size = tagRuleList.size(); i < size; i++) {
647                                    IRule<T> rule = tagRuleList.get(i);
648    
649                                    if (DEBUG)
650                                            log("\t\tRunning TAG Rule: %s", rule);
651    
652                                    rule.handleTag(this, true, userObject);
653                            }
654                    }
655    
656                    // Process the ATTR rules
657                    if (attrRuleList != null) {
658                            for (int i = 0, size = attrRuleList.size(); i < size; i++) {
659                                    IRule<T> rule = attrRuleList.get(i);
660    
661                                    if (DEBUG)
662                                            log("\t\tRunning ATTR Rule: %s", rule);
663    
664                                    String[] attrNames = rule.getAttributeNames();
665    
666                                    // Be safe, jump to the next rule if this one has no name
667                                    // entries
668                                    if (attrNames == null || attrNames.length == 0)
669                                            continue;
670    
671                                    /*
672                                     * PERFORMANCE: Generating the substrings is the fastest way to
673                                     * parse out the matching rules as it shares the same underlying
674                                     * char[] used to represent the entire location path or
675                                     * attribute name and just creates a new simple String instance
676                                     * with modified index/offset values that is GC'ed quickly and
677                                     * easily (uses a special package-protected String constructor).
678                                     * 
679                                     * Using regexp to match, splitting the rule or just about any
680                                     * other approach would have been magnitudes more expensive both
681                                     * in memory and CPU requirements than doing a simple substring.
682                                     */
683                                    for (int j = 0; j < attrNames.length; j++) {
684                                            String attrName = attrNames[j];
685                                            String localName = null;
686                                            String namespaceURI = null;
687    
688                                            // Parse the namespaceURI out of the name if necessary
689                                            if (attrName.charAt(0) == '[') {
690                                                    int endIndex = attrName.indexOf(']');
691    
692                                                    /*
693                                                     * Make sure the rule is valid so we avoid out of bounds
694                                                     * and keep the caller informed when their rules are
695                                                     * busted by failing fast.
696                                                     */
697                                                    if (endIndex <= 2)
698                                                            throw new XMLParserException(
699                                                                            "namespace URI for rule looks to be incomplete or empty for IRule: "
700                                                                                            + rule);
701    
702                                                    namespaceURI = attrName.substring(1, endIndex);
703                                            }
704    
705                                            int startIndex = (namespaceURI == null ? 0 : namespaceURI
706                                                            .length() + 2);
707    
708                                            /*
709                                             * Make sure the rule is valid so we avoid out of bounds and
710                                             * keep the caller informed when their rules are busted by
711                                             * failing fast.
712                                             */
713                                            if (attrName.length() - startIndex <= 1)
714                                                    throw new XMLParserException(
715                                                                    "local name for rule looks to be missing for IRule: "
716                                                                                    + rule);
717    
718                                            // Parse the local name
719                                            localName = attrName.substring(startIndex,
720                                                            attrName.length());
721    
722                                            // Give the parsed attribute value to the matching rule
723                                            rule.handleParsedAttribute(this, j,
724                                                            xpp.getAttributeValue(namespaceURI, localName),
725                                                            userObject);
726                                    }
727                            }
728                    }
729            }
730    
731            /**
732             * Used to process a {@link XmlPullParser#TEXT} event.
733             * <p/>
734             * By default this processes all {@link IRule}s of type
735             * {@link Type#CHARACTER} that match the parser's current location.
736             * 
737             * @param userObject
738             *            The user-supplied object passed through from this parse method
739             *            to the matching {@link IRule}'s <code>handleXXX</code> method
740             *            when a match is found, or <code>null</code> if no user object
741             *            is needed. Passing through a user-object is just meant as a
742             *            convenience for giving the handler methods on the
743             *            {@link IRule}'s access to objects like DAOs that can be used
744             *            to persist or process parsed data easily.
745             */
746            protected void doText(T userObject) {
747                    if (DEBUG)
748                            log("TEXT: %s", location);
749    
750                    // Get the rules for the current path
751                    List<IRule<T>> ruleList = charRuleMap.get(location.getCachedHashCode());
752    
753                    // If there are no rules for the current path, then we are done.
754                    if (ruleList == null || ruleList.isEmpty())
755                            return;
756    
757                    if (DEBUG)
758                            log("\t%d rules found for TEXT...", ruleList.size());
759    
760                    String text = xpp.getText();
761    
762                    // Give the parsed text to all matching IRules for this path
763                    for (int i = 0, size = ruleList.size(); i < size; i++) {
764                            IRule<T> rule = ruleList.get(i);
765    
766                            if (DEBUG)
767                                    log("\t\tRunning Rule: %s", rule);
768    
769                            rule.handleParsedCharacters(this, text, userObject);
770                    }
771            }
772    
773            /**
774             * Used to process a {@link XmlPullParser#END_TAG} event.
775             * 
776             * @param userObject
777             *            The user-supplied object passed through from this parse method
778             *            to the matching {@link IRule}'s <code>handleXXX</code> method
779             *            when a match is found, or <code>null</code> if no user object
780             *            is needed. Passing through a user-object is just meant as a
781             *            convenience for giving the handler methods on the
782             *            {@link IRule}'s access to objects like DAOs that can be used
783             *            to persist or process parsed data easily.
784             */
785            protected void doEndTag(T userObject) {
786                    // Get the rules for the current path
787                    List<IRule<T>> tagRuleList = tagRuleMap.get(location
788                                    .getCachedHashCode());
789    
790                    // If there are no rules for the current path, then we are done.
791                    if (tagRuleList != null && !tagRuleList.isEmpty()) {
792                            if (DEBUG)
793                                    log("\t%d TAG rules found for END_TAG...", tagRuleList.size());
794    
795                            // Process the TAG rules
796                            for (int i = 0, size = tagRuleList.size(); i < size; i++) {
797                                    IRule<T> rule = tagRuleList.get(i);
798    
799                                    if (DEBUG)
800                                            log("\t\tRunning TAG Rule: %s", rule);
801    
802                                    rule.handleTag(this, false, userObject);
803                            }
804                    }
805    
806                    // Update parser location
807                    location.pop();
808    
809                    if (DEBUG)
810                            log("END_TAG: %s", location);
811            }
812    
813            /**
814             * Used to process a {@link XmlPullParser#END_DOCUMENT} event.
815             * <p/>
816             * By default this method simply logs a debug statement if debugging is
817             * enabled, but this stub is provided to make overriding the default
818             * behavior easier if desired.
819             * 
820             * @param userObject
821             *            The user-supplied object passed through from this parse method
822             *            to the matching {@link IRule}'s <code>handleXXX</code> method
823             *            when a match is found, or <code>null</code> if no user object
824             *            is needed. Passing through a user-object is just meant as a
825             *            convenience for giving the handler methods on the
826             *            {@link IRule}'s access to objects like DAOs that can be used
827             *            to persist or process parsed data easily.
828             */
829            protected void doEndDocument(T userObject) {
830                    if (DEBUG)
831                            log("END_DOCUMENT, Parsing COMPLETE");
832            }
833    
834            /**
835             * Simple and fast class used to mock the behavior of a stack in the form of
836             * a string for the purposes of "pushing" and "popping" the parser's current
837             * location within an XML document as it processes START and END_TAG events.
838             * <p/>
839             * Performance is optimized by using a {@link StringBuilder} who's length is
840             * chopped (which just adjusts an <code>int</code> value) to simulate a
841             * "pop" off the top.
842             * <h3>Performance</h3>
843             * As of SJXP 2.0 String object creation and char[] duplication (e.g.
844             * {@link System#arraycopy(Object, int, Object, int, int)}) has been
845             * completely removed and replaced with using simple integer hash codes.
846             * <p/>
847             * The performance improvement is huge over the original toString-based
848             * method of matching {@link IRule}'s <code>locationPath</code>s against the
849             * parser's current location.
850             * 
851             * @author Riyad Kalla (software@thebuzzmedia.com)
852             */
853            class Location {
854                    private static final int HASH_CODE_CACHE_SIZE = 512;
855    
856                    private int hashCode;
857                    private Integer[] hashCodeCache;
858    
859                    private StringBuilder path;
860                    private List<Integer> lengthList;
861    
862                    /**
863                     * Creates a new empty location.
864                     */
865                    public Location() {
866                            hashCode = 0;
867                            hashCodeCache = new Integer[HASH_CODE_CACHE_SIZE];
868    
869                            path = new StringBuilder(256);
870                            lengthList = new ArrayList<Integer>(16);
871                    }
872    
873                    /**
874                     * Overridden to calculate the hash code of this location using the
875                     * exact same hash code calculation that {@link String#hashCode()} uses.
876                     * This allows us to say a <code>String</code> with the content
877                     * "/library/book/title" is equal to an instance of this class
878                     * representing the same location when doing lookups in a {@link Map}.
879                     * <p/>
880                     * This method calculates the hash code and then caches it, followup
881                     * calls to {@link #push(String, String)} or {@link #pop()} invalidate
882                     * the cached hash code allowing it to be recalculated again on the next
883                     * call.
884                     */
885                    @Override
886                    public int hashCode() {
887                            /*
888                             * If the hash code is already 0 and our path is empty, there is
889                             * nothing to compute so the hash code stays 0. Otherwise we drop
890                             * into the for-loop and calculate the String-equivalent hash code.
891                             */
892                            if (hashCode == 0 && path.length() > 0) {
893                                    for (int i = 0, length = path.length(); i < length; i++) {
894                                            hashCode = 31 * hashCode + path.charAt(i);
895                                    }
896                            }
897    
898                            return hashCode;
899                    }
900    
901                    /**
902                     * Used to get a cached {@link Integer} version of the <code>int</code>
903                     * {@link #hashCode()} return value.
904                     * <p/>
905                     * To avoid unnecessary {@link Integer} allocations, this method caches
906                     * up to a certain number of {@link Integer} instances, re-using them
907                     * every time the same hash code value comes back up and creating new
908                     * instances when it doesn't.
909                     * <p/>
910                     * If a larger number of {@link Integer} instances are created than the
911                     * underlying cache can hold, then a new instance will be created and
912                     * returned like normal.
913                     * <h3>Design</h3>
914                     * The reason this works so well for parsing XML is because of the
915                     * nested, tag-matching structure of XML. When considering unique paths
916                     * inside of an XML doc (e.g. "/library", "/library/book", etc.) there
917                     * are typically not that many; maybe 20, 50 or less than a 100 in most
918                     * cases.
919                     * <p/>
920                     * Once the hash code {@link Integer} values for these unique paths is
921                     * created and cached, once we re-encounter that path again and again,
922                     * we don't need to recreate that hash code {@link Integer}, we can just
923                     * use the one from the previous occurrence.
924                     * 
925                     * @return a cached {@link Integer} version of the <code>int</code>
926                     *         {@link #hashCode()} return value.
927                     */
928                    public Integer getCachedHashCode() {
929                            // Recalculate the hash code
930                            hashCode();
931    
932                            // Figure out the index, in our cache, where this value WOULD be.
933                            int index = hashCode % hashCodeCache.length;
934    
935                            // Absolute value only
936                            if (index < 0)
937                                    index = -index;
938    
939                            // Get the Integer we think represents our value.
940                            Integer value = hashCodeCache[index];
941    
942                            // If we haven't created an Integer for this value yet, do it now.
943                            if (value == null)
944                                    hashCodeCache[index] = (value = Integer.valueOf(hashCode));
945                            /*
946                             * If a collision has occurred and we have filled up our cache
947                             * already and the Integer we grabbed doesn't represent our int
948                             * value, forget the cache and just create a new Integer the old
949                             * fashion way and return it.
950                             * 
951                             * The hope is that the cache is always large enough that we only
952                             * ever hit it and have no misses like this.
953                             */
954                            else if (hashCode != value.intValue())
955                                    value = Integer.valueOf(hashCode);
956    
957                            return value;
958                    }
959    
960                    /**
961                     * Used to clear all the internal state of the location.
962                     */
963                    public void clear() {
964                            hashCode = 0;
965                            hashCodeCache = new Integer[HASH_CODE_CACHE_SIZE];
966    
967                            path.setLength(0);
968                            lengthList.clear();
969                    }
970    
971                    /**
972                     * "Pushes" a new local name and optional namespace URI onto the "stack"
973                     * by appending it to the current location path that represents the
974                     * parser's location inside of the XML doc.
975                     * 
976                     * @param localName
977                     *            The local name of the tag (e.g. "title").
978                     * @param namespaceURI
979                     *            Optionally, the full qualifying namespace URI for this
980                     *            tag.
981                     */
982                    public void push(String localName, String namespaceURI) {
983                            // Clear the hash code cache first to be safe.
984                            hashCode = 0;
985    
986                            // Remember the length before we inserted this last entry
987                            lengthList.add(path.length());
988    
989                            // Add separator
990                            path.append('/');
991    
992                            // Add the namespace URI if there is one.
993                            if (namespaceURI != null && namespaceURI.length() > 0)
994                                    path.append('[').append(namespaceURI).append(']');
995    
996                            // Append the local name
997                            path.append(localName);
998                    }
999    
1000                    /**
1001                     * "Pops" the last pushed path element off the "stack" by re-adjusting
1002                     * the {@link StringBuilder}'s length to what it was before the last
1003                     * element was appended.
1004                     * <p/>
1005                     * This effectively chops the last element off the path without doing a
1006                     * more costly {@link StringBuilder#delete(int, int)} operation that
1007                     * would incur a call to
1008                     * {@link System#arraycopy(Object, int, Object, int, int)} by simply
1009                     * adjusting a single <code>int</code> counter inside of
1010                     * {@link StringBuilder}.
1011                     */
1012                    public void pop() {
1013                            // Clear the hash code cache first to be safe.
1014                            hashCode = 0;
1015    
1016                            // Get the length before the last insertion
1017                            Integer lastLength = lengthList.remove(lengthList.size() - 1);
1018    
1019                            // 'Pop' the last insertion by cropping the length to exclude it.
1020                            path.setLength(lastLength);
1021                    }
1022            }
1023    }