001 /** 002 * Copyright 2011 The Buzz Media, LLC 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 package com.thebuzzmedia.sjxp; 017 018 import java.io.IOException; 019 import java.io.InputStream; 020 import java.io.UnsupportedEncodingException; 021 import java.nio.charset.Charset; 022 import java.util.ArrayList; 023 import java.util.HashMap; 024 import java.util.List; 025 import java.util.Map; 026 027 import org.xmlpull.v1.XmlPullParser; 028 import org.xmlpull.v1.XmlPullParserException; 029 import org.xmlpull.v1.XmlPullParserFactory; 030 031 import com.thebuzzmedia.sjxp.rule.IRule; 032 import com.thebuzzmedia.sjxp.rule.IRule.Type; 033 034 /** 035 * Class used to define a parser that makes parsing using the performance of an 036 * XML Pull Parser with the ease of XPath-like expressions possible. 037 * 038 * <h3>Thread Safety</h3> This class is not thread-safe, however instances of 039 * {@link XMLParser} can safely be re-used to parse multiple files once the 040 * previous parse operation is done. 041 * 042 * @param <T> 043 * The class type of any user-supplied object that the caller wishes 044 * to be passed through from one of the {@link XMLParser}'s 045 * <code>parse</code> methods directly to the handler when an 046 * {@link IRule} matches. This is typically a data storage mechanism 047 * like a DAO or cache used to store the parsed value in some 048 * valuable way, but it can ultimately be anything. 
If you do not 049 * need to make use of the user object, there is no need to 050 * parameterize the class. 051 * 052 * @author Riyad Kalla (software@thebuzzmedia.com) 053 */ 054 public class XMLParser<T> { 055 /** 056 * Flag used to indicate if debugging output has been enabled by setting the 057 * "sjxp.debug" system property to <code>true</code>. This value will be 058 * <code>false</code> if the "sjxp.debug" system property is undefined or 059 * set to <code>false</code>. 060 * <p/> 061 * This system property can be set on startup with:<br/> 062 * <code> 063 * -Dsjxp.debug=true 064 * </code> or by calling {@link System#setProperty(String, String)} before 065 * this class is loaded. 066 * <p/> 067 * This is <code>false</code> by default. 068 */ 069 public static final Boolean DEBUG = Boolean.getBoolean("sjxp.debug"); 070 071 /** 072 * Flag used to indicate if this parser should be namespace-aware by setting 073 * the "sjxp.namespaces" system property to <code>true</code>. This value 074 * will be <code>true</code> if the "sjxp.namespaces" system property is 075 * undefined. Namespace awareness can only be disabled by setting this 076 * system property to <code>false</code>. 077 * <p/> 078 * <strong>NOTE</strong>: If you intentionally disable namespace awareness, 079 * any {@link IRule} you provide that uses namespace qualified values (e.g. 080 * [http://w3.org/text]book) will fail to match as the parser can no longer 081 * see namespace URIs. 082 * <p/> 083 * This system property can be set on startup with:<br/> 084 * <code> 085 * -Dsjxp.namespaces=true 086 * </code> or by calling {@link System#setProperty(String, String)} before 087 * this class is loaded. 088 * <p/> 089 * This is <code>true</code> by default. 090 */ 091 public static final Boolean ENABLE_NAMESPACES = (System 092 .getProperty("sjxp.namespaces") == null ? 
Boolean.TRUE : Boolean 093 .getBoolean("sjxp.namespaces")); 094 095 /** 096 * Flag used to indicate if this parser should validate the parsed XML 097 * against the references DTD or XML Schema by setting the "sjxp.validation" 098 * system property to <code>true</code>. This value will be 099 * <code>false</code> if the "sjxp.validation" system property is undefined 100 * or set to <code>false</code>. 101 * <p/> 102 * This system property can be set on startup with:<br/> 103 * <code> 104 * -Dsjxp.validation=true 105 * </code> or by calling {@link System#setProperty(String, String)} before 106 * this class is loaded. 107 * <p/> 108 * This is <code>false</code> by default. 109 */ 110 public static final Boolean ENABLE_VALIDATION = Boolean 111 .getBoolean("sjxp.validation"); 112 113 /** 114 * Prefix to every log message this library logs. Using a well-defined 115 * prefix helps make it easier both visually and programmatically to scan 116 * log files for messages produced by this library. 117 * <p/> 118 * The value is "[sjxp] " (including the space). 119 */ 120 public static final String LOG_MESSAGE_PREFIX = "[sjxp] "; 121 122 /** 123 * Singleton {@link XmlPullParserFactory} instance used to create new 124 * underlying {@link XmlPullParser} instances for each instance of 125 * {@link XMLParser}. 126 */ 127 public static final XmlPullParserFactory XPP_FACTORY; 128 129 /** 130 * Static initializer used to init the {@link XmlPullParserFactory} with the 131 * configured namespace and validation settings. 
132 */ 133 static { 134 if (DEBUG) 135 log("Debug output ENABLED"); 136 137 try { 138 XPP_FACTORY = XmlPullParserFactory.newInstance(); 139 140 // Configure pull parser features 141 XPP_FACTORY.setFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES, 142 ENABLE_NAMESPACES); 143 XPP_FACTORY.setFeature(XmlPullParser.FEATURE_VALIDATION, 144 ENABLE_VALIDATION); 145 146 if (DEBUG) 147 log("XmlPullParserFactory configured [namespaces=%s, validation=%s]", 148 ENABLE_NAMESPACES, ENABLE_VALIDATION); 149 } catch (XmlPullParserException e) { 150 throw new RuntimeException( 151 "An exception occurred while calling XmlPullParserFactory.newInstance(). A library providing the impl of the XML Pull Parser spec (e.g. XPP3 or Android SDK) must be available at runtime.", 152 e); 153 } 154 } 155 156 /** 157 * Helper method used to ensure a message is loggable before it is logged 158 * and then pre-pend a universal prefix to all log messages generated by 159 * this library to make the log entries easy to parse visually or 160 * programmatically. 161 * <p/> 162 * If a message cannot be logged (logging is disabled) then this method 163 * returns immediately. 164 * <p/> 165 * <strong>NOTE</strong>: Because Java will auto-box primitive arguments 166 * into Objects when building out the <code>params</code> array, care should 167 * be taken not to call this method with primitive values unless 168 * {@link #DEBUG} is <code>true</code>; otherwise the VM will be spending 169 * time performing unnecessary auto-boxing calculations. 170 * 171 * @param message 172 * The log message in <a href= 173 * "http://download.oracle.com/javase/6/docs/api/java/util/Formatter.html#syntax" 174 * >format string syntax</a> that will be logged. 175 * @param params 176 * The parameters that will be swapped into all the place holders 177 * in the original messages before being logged. 178 * 179 * @see #LOG_MESSAGE_PREFIX 180 */ 181 protected static void log(String message, Object... 
params) { 182 if (DEBUG) 183 System.out.printf(LOG_MESSAGE_PREFIX + message + '\n', params); 184 } 185 186 private String toStringCache; 187 private boolean continueParsing = true; 188 189 private Location location; 190 private XmlPullParser xpp; 191 192 private Map<Integer, List<IRule<T>>> tagRuleMap; 193 private Map<Integer, List<IRule<T>>> attrRuleMap; 194 private Map<Integer, List<IRule<T>>> charRuleMap; 195 196 /** 197 * Create a new parser that uses the given {@link IRule}s when parsing any 198 * XML content. 199 * 200 * @param rules 201 * The rules applied to any parsed content. 202 * 203 * @throws IllegalArgumentException 204 * if <code>rules</code> is <code>null</code> or empty. 205 * @throws XMLParserException 206 * if the {@link #XPP_FACTORY} is unable to create a new 207 * {@link XmlPullParser} instance and throws an exception. 208 */ 209 public XMLParser(IRule<T>... rules) throws IllegalArgumentException, 210 XMLParserException { 211 if (rules == null || rules.length == 0) 212 throw new IllegalArgumentException( 213 "rules cannot be null or empty, you must provide at least 1 rule to execute otherwise parsing will do nothing."); 214 215 location = new Location(); 216 217 try { 218 xpp = XPP_FACTORY.newPullParser(); 219 } catch (XmlPullParserException e) { 220 throw new XMLParserException( 221 "An exception occurred while trying to create a new XmlPullParser instance using the XmlPullParserFactory.", 222 e); 223 } 224 225 // Load all the rules 226 initRules(rules); 227 } 228 229 /** 230 * Overridden to provide a nicely formatted representation of the parser for 231 * easy debugging. 232 * <p/> 233 * As an added bonus, since {@link XMLParser}s are intended to be immutable, 234 * the result of <code>toString</code> is cached on the first call and the 235 * cache returned every time to avoid re-computing the completed 236 * {@link String}. 237 * 238 * @return a nicely formatted representation of the parser for easy 239 * debugging. 
240 */ 241 @Override 242 public synchronized String toString() { 243 if (toStringCache == null) { 244 toStringCache = this.getClass().getName() + "[attributeRules=" 245 + attrRuleMap + ", characterRules=" + charRuleMap + "]"; 246 } 247 248 return toStringCache; 249 } 250 251 /** 252 * Used to indicate to the parser that you would like it to stop parsing. 253 * <p/> 254 * Internally the parser uses a simple <code>boolean</code> to indicate if 255 * it should keep parsing. A call to this method sets the boolean value to 256 * <code>false</code> which the parser checks at the next parse event and 257 * then stops. 258 * <p/> 259 * This is a safe operation that simply flips a flag to tell the underlying 260 * {@link XmlPullParser} to stop working after it's done with its current 261 * parse event and return from whichever <code>parse</code> method was 262 * called. 263 */ 264 public void stop() { 265 continueParsing = false; 266 } 267 268 /** 269 * Parse the XML out of the given stream matching the {@link IRule}s 270 * provided when the {@link XMLParser} was instantiated. 271 * <p/> 272 * The underlying {@link XmlPullParser} will attempt to determine the 273 * stream's encoding based on the pull parser spec or fall back to a default 274 * of UTF-8. 275 * <p/> 276 * This class will make no attempt at closing the given {@link InputStream}, 277 * the caller must take care to clean up that resource. 278 * <h3>Stopping Parsing</h3> 279 * Parsing can be safely stopped by calling {@link #stop()}. This allows 280 * {@link IRule} implementations control over stopping parsing, for example, 281 * if an arbitrary threshold is hit. A followup call to any of the 282 * <code>parse</code> methods will reset the stopped state. 283 * 284 * @param source 285 * The stream that XML content will be read out of. 286 * 287 * @throws IllegalArgumentException 288 * if <code>source</code> is <code>null</code>. 
289 * @throws XMLParserException 290 * if any error occurs with the underlying stream during parsing 291 * of if the XML content itself is malformed and the underlying 292 * pull parser cannot parse it. 293 */ 294 public void parse(InputStream source) throws IllegalArgumentException, 295 XMLParserException { 296 try { 297 parse(source, null, null); 298 } catch (UnsupportedEncodingException e) { 299 // no-op, this should never happen as null is a valid encoding. 300 } 301 } 302 303 /** 304 * Parse the XML out of the given stream matching the {@link IRule}s 305 * provided when the {@link XMLParser} was instantiated. 306 * <p/> 307 * The underlying {@link XmlPullParser} will attempt to determine the 308 * stream's encoding based on the pull parser spec or fall back to a default 309 * of UTF-8. 310 * <p/> 311 * This class will make no attempt at closing the given {@link InputStream}, 312 * the caller must take care to clean up that resource. 313 * <h3>Stopping Parsing</h3> 314 * Parsing can be safely stopped by calling {@link #stop()}. This allows 315 * {@link IRule} implementations control over stopping parsing, for example, 316 * if an arbitrary threshold is hit. A followup call to any of the 317 * <code>parse</code> methods will reset the stopped state. 318 * 319 * @param source 320 * The stream that XML content will be read out of. 321 * @param userObject 322 * The user-supplied object passed through from this parse method 323 * to the matching {@link IRule}'s <code>handleXXX</code> method 324 * when a match is found, or <code>null</code> if no user object 325 * is needed. Passing through a user-object is just meant as a 326 * convenience for giving the handler methods on the 327 * {@link IRule}'s access to objects like DAOs that can be used 328 * to persist or process parsed data easily. 329 * 330 * @throws IllegalArgumentException 331 * if <code>source</code> is <code>null</code>. 
332 * @throws XMLParserException 333 * if any error occurs with the underlying stream during parsing 334 * of if the XML content itself is malformed and the underlying 335 * pull parser cannot parse it. 336 */ 337 public void parse(InputStream source, T userObject) 338 throws IllegalArgumentException, XMLParserException { 339 try { 340 parse(source, null, userObject); 341 } catch (UnsupportedEncodingException e) { 342 // no-op, this should never happen as null is a valid encoding. 343 } 344 } 345 346 /** 347 * Parse the XML out of the given stream (producing content matching the 348 * given encoding) matching the {@link IRule}s provided when the 349 * {@link XMLParser} was instantiated. 350 * <p/> 351 * This class will make no attempt at closing the given {@link InputStream}, 352 * the caller must take care to clean up that resource. 353 * <h3>Stopping Parsing</h3> 354 * Parsing can be safely stopped by calling {@link #stop()}. This allows 355 * {@link IRule} implementations control over stopping parsing, for example, 356 * if an arbitrary threshold is hit. A followup call to any of the 357 * <code>parse</code> methods will reset the stopped state. 358 * 359 * @param source 360 * The stream that XML content will be read out of. 361 * @param encoding 362 * The character encoding (e.g. "UTF-8") of the data from the 363 * given stream. If the encoding is not known, passing 364 * <code>null</code> or calling {@link #parse(InputStream)} 365 * instead will allow the underlying {@link XmlPullParser} to try 366 * and automatically determine the encoding. 367 * 368 * @throws IllegalArgumentException 369 * if <code>source</code> is <code>null</code>. 
370 * @throws UnsupportedEncodingException 371 * if <code>encoding</code> represents an encoding name that is 372 * not recognized by {@link Charset#isSupported(String)} 373 * @throws XMLParserException 374 * if any error occurs with the underlying stream during parsing 375 * of if the XML content itself is malformed and the underlying 376 * pull parser cannot parse it. 377 */ 378 public void parse(InputStream source, String encoding) 379 throws IllegalArgumentException, UnsupportedEncodingException, 380 XMLParserException { 381 parse(source, encoding, null); 382 } 383 384 /** 385 * Parse the XML out of the given stream (producing content matching the 386 * given encoding) matching the {@link IRule}s provided when the 387 * {@link XMLParser} was instantiated. 388 * <p/> 389 * This class will make no attempt at closing the given {@link InputStream}, 390 * the caller must take care to clean up that resource. 391 * <h3>Stopping Parsing</h3> 392 * Parsing can be safely stopped by calling {@link #stop()}. This allows 393 * {@link IRule} implementations control over stopping parsing, for example, 394 * if an arbitrary threshold is hit. A followup call to any of the 395 * <code>parse</code> methods will reset the stopped state. 396 * 397 * @param source 398 * The stream that XML content will be read out of. 399 * @param encoding 400 * The character encoding (e.g. "UTF-8") of the data from the 401 * given stream. If the encoding is not known, passing 402 * <code>null</code> or calling {@link #parse(InputStream)} 403 * instead will allow the underlying {@link XmlPullParser} to try 404 * and automatically determine the encoding. 405 * @param userObject 406 * The user-supplied object passed through from this parse method 407 * to the matching {@link IRule}'s <code>handleXXX</code> method 408 * when a match is found, or <code>null</code> if no user object 409 * is needed. 
Passing through a user-object is just meant as a 410 * convenience for giving the handler methods on the 411 * {@link IRule}'s access to objects like DAOs that can be used 412 * to persist or process parsed data easily. 413 * 414 * @throws IllegalArgumentException 415 * if <code>source</code> is <code>null</code>. 416 * @throws UnsupportedEncodingException 417 * if <code>encoding</code> represents an encoding name that is 418 * not recognized by {@link Charset#isSupported(String)} 419 * @throws XMLParserException 420 * if any error occurs with the underlying stream during parsing 421 * of if the XML content itself is malformed and the underlying 422 * pull parser cannot parse it. 423 */ 424 public void parse(InputStream source, String encoding, T userObject) 425 throws IllegalArgumentException, UnsupportedEncodingException, 426 XMLParserException { 427 if (source == null) 428 throw new IllegalArgumentException("source cannot be null"); 429 if (encoding != null) { 430 // If empty, ensure it is null so XPP gets encoding from XML header 431 if (encoding.trim().length() == 0) 432 encoding = null; 433 // Extra-safe, make sure the provided encoding is valid 434 else if (!Charset.isSupported(encoding)) 435 throw new UnsupportedEncodingException( 436 "Encoding [" 437 + encoding 438 + "] is not a valid charset encoding in this runtime according to Charset.isSupported(encoding)."); 439 } 440 441 try { 442 xpp.setInput(source, encoding); 443 444 if (DEBUG) 445 log("Underlying XmlPullParser input set [type=InputStream, encoding=%s (null is OK), userObject=%s]", 446 xpp.getInputEncoding(), (userObject == null ? 
"" 447 : userObject)); 448 } catch (XmlPullParserException e) { 449 throw new XMLParserException( 450 "Unable to set the given InputStream (with an optional encoding of '" 451 + encoding 452 + "') as input for the underlying XmlPullParser.", 453 e); 454 } 455 456 try { 457 doParse(userObject); 458 } catch (IOException e) { 459 throw new XMLParserException( 460 "An exception occurred while parsing the given source, the XML document may be malformed.", 461 e); 462 } catch (XmlPullParserException e) { 463 throw new XMLParserException( 464 "An error with the underlying data stream being parsed occurred.", 465 e); 466 } 467 } 468 469 protected void initRules(IRule<T>... rules) { 470 // calculate a rough optimal size for the rule maps 471 int optSize = (rules.length > 64 ? rules.length * 2 : 64); 472 473 // init the rule maps 474 tagRuleMap = new HashMap<Integer, List<IRule<T>>>(optSize); 475 attrRuleMap = new HashMap<Integer, List<IRule<T>>>(optSize); 476 charRuleMap = new HashMap<Integer, List<IRule<T>>>(optSize); 477 478 // init the rules 479 List<IRule<T>> ruleList = null; 480 481 for (int i = 0, length = rules.length; i < length; i++) { 482 IRule<T> rule = rules[i]; 483 484 switch (rule.getType()) { 485 case TAG: 486 // Get the rule list for this path 487 ruleList = tagRuleMap.get(rule.getLocationPath()); 488 489 // If there wasn't already a rule list, create and add it 490 if (ruleList == null) { 491 ruleList = new ArrayList<IRule<T>>(3); 492 tagRuleMap.put(rule.getLocationPath().hashCode(), ruleList); 493 } 494 break; 495 496 case ATTRIBUTE: 497 // Get the rule list for this path 498 ruleList = attrRuleMap.get(rule.getLocationPath()); 499 500 // If there wasn't already a rule list, create and add it 501 if (ruleList == null) { 502 ruleList = new ArrayList<IRule<T>>(3); 503 attrRuleMap 504 .put(rule.getLocationPath().hashCode(), ruleList); 505 } 506 break; 507 508 case CHARACTER: 509 // Get the rule list for this path 510 ruleList = 
charRuleMap.get(rule.getLocationPath()); 511 512 // If there wasn't already a rule list, create and add it 513 if (ruleList == null) { 514 ruleList = new ArrayList<IRule<T>>(3); 515 charRuleMap 516 .put(rule.getLocationPath().hashCode(), ruleList); 517 } 518 break; 519 } 520 521 // Add the rule to the list for the given path 522 ruleList.add(rule); 523 } 524 525 if (DEBUG) 526 log("Initialized %d TAG rules, %d ATTRIBUTE rules and %d CHARACTER rules.", 527 tagRuleMap.size(), attrRuleMap.size(), charRuleMap.size()); 528 } 529 530 /** 531 * Uses the underlying {@link XmlPullParser} to begin parsing through the 532 * XML content from the given stream. This method's implementation is 533 * simple, acting like a traffic-cop responding to 534 * {@link XmlPullParser#START_TAG}, {@link XmlPullParser#TEXT}, 535 * {@link XmlPullParser#END_TAG} and {@link XmlPullParser#END_DOCUMENT} 536 * events by calling the appropriate <code>doXXX</code> methods. 537 * <p/> 538 * Developers creating a subclass of {@link XMLParser} are meant to override 539 * one of the {@link #doStartTag(Object)}, {@link #doText(Object)}, 540 * {@link #doEndTag(Object)} and {@link #doEndDocument(Object)} methods to 541 * add custom behavior and not necessarily override this central method. 542 * <h3>Stopping Parsing</h3> 543 * Parsing can be safely stopped by calling {@link #stop()}. This allows 544 * {@link IRule} implementations control over stopping parsing, for example, 545 * if an arbitrary threshold is hit. A followup call to any of the 546 * <code>parse</code> methods will reset the stopped state. 547 * 548 * @param userObject 549 * The user-supplied object passed through from this parse method 550 * to the matching {@link IRule}'s <code>handleXXX</code> method 551 * when a match is found, or <code>null</code> if no user object 552 * is needed. 
Passing through a user-object is just meant as a 553 * convenience for giving the handler methods on the 554 * {@link IRule}'s access to objects like DAOs that can be used 555 * to persist or process parsed data easily. 556 * 557 * @throws IOException 558 * if an error occurs with reading from the underlying 559 * {@link InputStream} given to one of the public 560 * <code>parse</code> methods. 561 * @throws XmlPullParserException 562 * if an error occurs while parsing the XML content from the 563 * underlying stream; typically resulting from malformed or 564 * invalid XML. 565 */ 566 protected void doParse(T userObject) throws IOException, 567 XmlPullParserException { 568 location.clear(); 569 continueParsing = true; 570 571 if (DEBUG) 572 log("Parsing starting..."); 573 574 long startTime = System.currentTimeMillis(); 575 576 while (continueParsing) { 577 switch (xpp.next()) { 578 case XmlPullParser.START_TAG: 579 doStartTag(userObject); 580 break; 581 582 case XmlPullParser.TEXT: 583 doText(userObject); 584 break; 585 586 case XmlPullParser.END_TAG: 587 doEndTag(userObject); 588 break; 589 590 case XmlPullParser.END_DOCUMENT: 591 continueParsing = false; 592 doEndDocument(userObject); 593 break; 594 } 595 } 596 597 if (DEBUG) { 598 long duration = System.currentTimeMillis() - startTime; 599 log("Parse COMPLETE, elapsed time: %dms (approx %f seconds)", 600 duration, (double) duration / (double) 1000); 601 } 602 } 603 604 /** 605 * Used to process a {@link XmlPullParser#START_TAG} event. 606 * <p/> 607 * By default this updates the internal location state of the parser, 608 * processes all {@link IRule}s of type {@link Type#TAG} and processes all 609 * {@link IRule}s of type {@link Type#ATTRIBUTE} that match the parser's 610 * current location. 
611 * 612 * @param userObject 613 * The user-supplied object passed through from this parse method 614 * to the matching {@link IRule}'s <code>handleXXX</code> method 615 * when a match is found, or <code>null</code> if no user object 616 * is needed. Passing through a user-object is just meant as a 617 * convenience for giving the handler methods on the 618 * {@link IRule}'s access to objects like DAOs that can be used 619 * to persist or process parsed data easily. 620 */ 621 protected void doStartTag(T userObject) { 622 // Update parser location 623 location.push(xpp.getName(), xpp.getNamespace()); 624 625 if (DEBUG) 626 log("START_TAG: %s", location); 627 628 // Get the rules for the current path 629 List<IRule<T>> tagRuleList = tagRuleMap.get(location 630 .getCachedHashCode()); 631 List<IRule<T>> attrRuleList = attrRuleMap.get(location 632 .getCachedHashCode()); 633 634 // If there are no rules for the current path, then we are done. 635 if ((tagRuleList == null || tagRuleList.isEmpty()) 636 && (attrRuleList == null || attrRuleList.isEmpty())) 637 return; 638 639 if (DEBUG) 640 log("\t%d TAG rules and %d ATTR rules found for START_TAG...", 641 (tagRuleList == null ? 0 : tagRuleList.size()), 642 (attrRuleList == null ? 
0 : attrRuleList.size())); 643 644 // Process the TAG rules 645 if (tagRuleList != null) { 646 for (int i = 0, size = tagRuleList.size(); i < size; i++) { 647 IRule<T> rule = tagRuleList.get(i); 648 649 if (DEBUG) 650 log("\t\tRunning TAG Rule: %s", rule); 651 652 rule.handleTag(this, true, userObject); 653 } 654 } 655 656 // Process the ATTR rules 657 if (attrRuleList != null) { 658 for (int i = 0, size = attrRuleList.size(); i < size; i++) { 659 IRule<T> rule = attrRuleList.get(i); 660 661 if (DEBUG) 662 log("\t\tRunning ATTR Rule: %s", rule); 663 664 String[] attrNames = rule.getAttributeNames(); 665 666 // Be safe, jump to the next rule if this one has no name 667 // entries 668 if (attrNames == null || attrNames.length == 0) 669 continue; 670 671 /* 672 * PERFORMANCE: Generating the substrings is the fastest way to 673 * parse out the matching rules as it shares the same underlying 674 * char[] used to represent the entire location path or 675 * attribute name and just creates a new simple String instance 676 * with modified index/offset values that is GC'ed quickly and 677 * easily (uses a special package-protected String constructor). 678 * 679 * Using regexp to match, splitting the rule or just about any 680 * other approach would have been magnitudes more expensive both 681 * in memory and CPU requirements than doing a simple substring. 682 */ 683 for (int j = 0; j < attrNames.length; j++) { 684 String attrName = attrNames[j]; 685 String localName = null; 686 String namespaceURI = null; 687 688 // Parse the namespaceURI out of the name if necessary 689 if (attrName.charAt(0) == '[') { 690 int endIndex = attrName.indexOf(']'); 691 692 /* 693 * Make sure the rule is valid so we avoid out of bounds 694 * and keep the caller informed when their rules are 695 * busted by failing fast. 
696 */ 697 if (endIndex <= 2) 698 throw new XMLParserException( 699 "namespace URI for rule looks to be incomplete or empty for IRule: " 700 + rule); 701 702 namespaceURI = attrName.substring(1, endIndex); 703 } 704 705 int startIndex = (namespaceURI == null ? 0 : namespaceURI 706 .length() + 2); 707 708 /* 709 * Make sure the rule is valid so we avoid out of bounds and 710 * keep the caller informed when their rules are busted by 711 * failing fast. 712 */ 713 if (attrName.length() - startIndex <= 1) 714 throw new XMLParserException( 715 "local name for rule looks to be missing for IRule: " 716 + rule); 717 718 // Parse the local name 719 localName = attrName.substring(startIndex, 720 attrName.length()); 721 722 // Give the parsed attribute value to the matching rule 723 rule.handleParsedAttribute(this, j, 724 xpp.getAttributeValue(namespaceURI, localName), 725 userObject); 726 } 727 } 728 } 729 } 730 731 /** 732 * Used to process a {@link XmlPullParser#TEXT} event. 733 * <p/> 734 * By default this processes all {@link IRule}s of type 735 * {@link Type#CHARACTER} that match the parser's current location. 736 * 737 * @param userObject 738 * The user-supplied object passed through from this parse method 739 * to the matching {@link IRule}'s <code>handleXXX</code> method 740 * when a match is found, or <code>null</code> if no user object 741 * is needed. Passing through a user-object is just meant as a 742 * convenience for giving the handler methods on the 743 * {@link IRule}'s access to objects like DAOs that can be used 744 * to persist or process parsed data easily. 745 */ 746 protected void doText(T userObject) { 747 if (DEBUG) 748 log("TEXT: %s", location); 749 750 // Get the rules for the current path 751 List<IRule<T>> ruleList = charRuleMap.get(location.getCachedHashCode()); 752 753 // If there are no rules for the current path, then we are done. 
754 if (ruleList == null || ruleList.isEmpty()) 755 return; 756 757 if (DEBUG) 758 log("\t%d rules found for TEXT...", ruleList.size()); 759 760 String text = xpp.getText(); 761 762 // Give the parsed text to all matching IRules for this path 763 for (int i = 0, size = ruleList.size(); i < size; i++) { 764 IRule<T> rule = ruleList.get(i); 765 766 if (DEBUG) 767 log("\t\tRunning Rule: %s", rule); 768 769 rule.handleParsedCharacters(this, text, userObject); 770 } 771 } 772 773 /** 774 * Used to process a {@link XmlPullParser#END_TAG} event. 775 * 776 * @param userObject 777 * The user-supplied object passed through from this parse method 778 * to the matching {@link IRule}'s <code>handleXXX</code> method 779 * when a match is found, or <code>null</code> if no user object 780 * is needed. Passing through a user-object is just meant as a 781 * convenience for giving the handler methods on the 782 * {@link IRule}'s access to objects like DAOs that can be used 783 * to persist or process parsed data easily. 784 */ 785 protected void doEndTag(T userObject) { 786 // Get the rules for the current path 787 List<IRule<T>> tagRuleList = tagRuleMap.get(location 788 .getCachedHashCode()); 789 790 // If there are no rules for the current path, then we are done. 791 if (tagRuleList != null && !tagRuleList.isEmpty()) { 792 if (DEBUG) 793 log("\t%d TAG rules found for END_TAG...", tagRuleList.size()); 794 795 // Process the TAG rules 796 for (int i = 0, size = tagRuleList.size(); i < size; i++) { 797 IRule<T> rule = tagRuleList.get(i); 798 799 if (DEBUG) 800 log("\t\tRunning TAG Rule: %s", rule); 801 802 rule.handleTag(this, false, userObject); 803 } 804 } 805 806 // Update parser location 807 location.pop(); 808 809 if (DEBUG) 810 log("END_TAG: %s", location); 811 } 812 813 /** 814 * Used to process a {@link XmlPullParser#END_DOCUMENT} event. 
815 * <p/> 816 * By default this method simply logs a debug statement if debugging is 817 * enabled, but this stub is provided to make overriding the default 818 * behavior easier if desired. 819 * 820 * @param userObject 821 * The user-supplied object passed through from this parse method 822 * to the matching {@link IRule}'s <code>handleXXX</code> method 823 * when a match is found, or <code>null</code> if no user object 824 * is needed. Passing through a user-object is just meant as a 825 * convenience for giving the handler methods on the 826 * {@link IRule}'s access to objects like DAOs that can be used 827 * to persist or process parsed data easily. 828 */ 829 protected void doEndDocument(T userObject) { 830 if (DEBUG) 831 log("END_DOCUMENT, Parsing COMPLETE"); 832 } 833 834 /** 835 * Simple and fast class used to mock the behavior of a stack in the form of 836 * a string for the purposes of "pushing" and "popping" the parser's current 837 * location within an XML document as it processes START and END_TAG events. 838 * <p/> 839 * Performance is optimized by using a {@link StringBuilder} who's length is 840 * chopped (which just adjusts an <code>int</code> value) to simulate a 841 * "pop" off the top. 842 * <h3>Performance</h3> 843 * As of SJXP 2.0 String object creation and char[] duplication (e.g. 844 * {@link System#arraycopy(Object, int, Object, int, int)}) has been 845 * completely removed and replaced with using simple integer hash codes. 846 * <p/> 847 * The performance improvement is huge over the original toString-based 848 * method of matching {@link IRule}'s <code>locationPath</code>s against the 849 * parser's current location. 
*
     * @author Riyad Kalla (software@thebuzzmedia.com)
     */
    class Location {
        /** Number of slots in the {@link Integer} hash code cache. */
        private static final int HASH_CODE_CACHE_SIZE = 512;

        /**
         * Memoized hash code of {@link #path}; <code>0</code> doubles as the
         * "not computed yet" marker and is reset by every mutation.
         */
        private int hashCode;

        /**
         * Boxed hash codes, indexed by <code>|hash % length|</code>. Entries
         * are validated by value before use, so they never go stale.
         */
        private final Integer[] hashCodeCache;

        /** The current location path, e.g. "/library/book". */
        private final StringBuilder path;

        /**
         * Length of {@link #path} as it was before each pushed element, most
         * recent last; used by {@link #pop()} to truncate the path.
         */
        private final List<Integer> lengthList;

        /**
         * Creates a new empty location.
         */
        public Location() {
            hashCode = 0;
            hashCodeCache = new Integer[HASH_CODE_CACHE_SIZE];

            path = new StringBuilder(256);
            lengthList = new ArrayList<Integer>(16);
        }

        /**
         * Calculates the hash code of this location using the exact same
         * formula that {@link String#hashCode()} uses, so a location
         * representing "/library/book/title" yields the same <code>int</code>
         * as that <code>String</code> does.
         * <p/>
         * Note that this class does not override <code>equals</code>: a
         * location is never <em>equal</em> to a <code>String</code>. Only the
         * hash values coincide, which is what allows the {@link Integer}
         * returned by {@link #getCachedHashCode()} to be used as a
         * {@link Map} key.
         * <p/>
         * The result is memoized; calls to {@link #push(String, String)},
         * {@link #pop()} or {@link #clear()} reset it so that it is
         * recalculated on the next call.
         *
         * @return the <code>String</code>-compatible hash code of the current
         *         path, or <code>0</code> for an empty path.
         */
        @Override
        public int hashCode() {
            /*
             * 0 means "nothing memoized". For an empty path the hash really
             * is 0, so there is nothing to compute. A non-empty path that
             * happens to hash to 0 is simply recomputed on every call.
             */
            if (hashCode == 0 && path.length() > 0) {
                for (int i = 0, length = path.length(); i < length; i++) {
                    hashCode = 31 * hashCode + path.charAt(i);
                }
            }

            return hashCode;
        }

        /**
         * Returns the current location path (e.g. "/library/book"), so that
         * formatting a location with <code>%s</code> in debug output shows
         * the path rather than the default object identity string.
         *
         * @return the current path, or an empty string for an empty location.
         */
        @Override
        public String toString() {
            return path.toString();
        }

        /**
         * Used to get a cached {@link Integer} version of the <code>int</code>
         * {@link #hashCode()} return value.
         * <p/>
         * To avoid unnecessary {@link Integer} allocations, a fixed number of
         * {@link Integer} instances is cached by slot. When the slot for the
         * current hash code is empty it is filled; when it already holds the
         * same value that instance is re-used; when it holds a different
         * value (a collision) the slot is left alone and an un-cached
         * {@link Integer} is returned instead.
         * <p/>
         * This pays off for XML because the number of distinct paths in a
         * document is usually small, so the same few hash codes recur.
         *
         * @return an {@link Integer} whose value equals {@link #hashCode()}.
         */
        public Integer getCachedHashCode() {
            // Make sure the memoized hash reflects the current path.
            final int hash = hashCode();

            // Slot where this value WOULD live; the remainder may be negative.
            int index = hash % hashCodeCache.length;

            if (index < 0) {
                index = -index;
            }

            Integer value = hashCodeCache[index];

            if (value == null) {
                // First time this slot is used: remember the boxed value.
                value = Integer.valueOf(hash);
                hashCodeCache[index] = value;
            } else if (value.intValue() != hash) {
                // Collision: keep the existing entry, hand out a fresh box.
                value = Integer.valueOf(hash);
            }

            return value;
        }

        /**
         * Resets the location to the empty path.
         * <p/>
         * The {@link Integer} cache is intentionally kept: its entries are
         * checked against the requested hash code before being returned, so
         * they remain valid and stay warm when a parser is re-used.
         */
        public void clear() {
            hashCode = 0;

            path.setLength(0);
            lengthList.clear();
        }

        /**
         * "Pushes" a new local name and optional namespace URI onto the "stack"
         * by appending it to the current location path. The appended element
         * has the form <code>/localName</code>, or
         * <code>/[namespaceURI]localName</code> when a non-empty namespace URI
         * is given.
         *
         * @param localName
         *            The local name of the tag (e.g. "title").
         * @param namespaceURI
         *            Optionally, the full qualifying namespace URI for this
         *            tag; <code>null</code> and the empty string both mean
         *            "no namespace".
         */
        public void push(String localName, String namespaceURI) {
            // The path is about to change, drop the memoized hash code.
            hashCode = 0;

            // Remember the length before we inserted this last entry
            lengthList.add(path.length());

            // Add separator
            path.append('/');

            // Add the namespace URI if there is one.
            if (namespaceURI != null && namespaceURI.length() > 0) {
                path.append('[').append(namespaceURI).append(']');
            }

            // Append the local name
            path.append(localName);
        }

        /**
         * "Pops" the last pushed path element off the "stack" by re-adjusting
         * the {@link StringBuilder}'s length to what it was before the last
         * element was appended.
         *
         * @throws IndexOutOfBoundsException
         *             if there is no pushed element left to pop.
         */
        public void pop() {
            // The path is about to change, drop the memoized hash code.
            hashCode = 0;

            // Get the length before the last insertion
            Integer lastLength = lengthList.remove(lengthList.size() - 1);

            // 'Pop' the last insertion by cropping the length to exclude it.
            path.setLength(lastLength);
        }
    }
}