On a recent project, I had a very large XML file that I needed to manipulate. Parsing the XML with the normal DOM method was very slow: for a file with 8,000 users, it took almost 10 minutes. This was clearly unacceptable, and I knew I could improve the speed.

That is when I stumbled upon the VTD-XML Java library. It took me quite a long time to get my head around the cursor concept and its implementation, but I finally made it to the end of the road. Replacing my DOM code with the VTD-XML code (example below) made XML parsing and manipulation about eight times faster.

Find out more information about the VTD-XML Java library here.

XML files

Original XML file

<batchResponse xmlns="urn:oasis:names:tc:DSML:2:0:core" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <searchResultEntry dn="cn=karble,ou=Users,ou=Active,o=corp">
        <attr name="mail">
            <value>TEST_1_Karina.Bleidere@belkast.com</value>
            <value>TEST_2_Karina.Bleidere@belkast.com</value>
        </attr>
        <attr name="workforceID">
            <value>12345</value>
        </attr>
        <attr name="sn">
            <value>Bleidere</value>
        </attr>
        <attr name="co">
            <value>Latvia</value>
        </attr>
        <attr name="fullName">
            <value>Karīna Bleidere</value>
        </attr>
        <attr name="givenName">
            <value>Karina</value>
        </attr>
        <attr name="loginActivationTime">
            <value>20180924000000Z</value>
        </attr>
    </searchResultEntry>
</batchResponse>

Updated XML file

<batchResponse export-count="1" export-date="November 12, 2024 7:59:42 PM EST" export-ldap-date="20241112195942Z" search-filter="workforceID=12345" xmlns="urn:oasis:names:tc:DSML:2:0:core" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <searchResultEntry src-dn="\belkast\corp\Active\Users\karble" object-count="1 of 1" dn="cn=karble,ou=Users,ou=Active,o=corp">
        <attr name="mail" count="2">
            <value>TEST_1_Karina.Bleidere@belkast.com</value>
            <value>TEST_2_Karina.Bleidere@belkast.com</value>
        </attr>
        <attr name="workforceID">
            <value>12345</value>
        </attr>
        <attr name="sn">
            <value>Bleidere</value>
        </attr>
        <attr name="co">
            <value>Latvia</value>
        </attr>
        <attr name="fullName">
            <value convert-from="Karīna Bleidere" xsi:type="xsd:base64Binary">S2FyxKtuYSBCbGVpZGVyZQ==</value>
        </attr>
        <attr name="givenName">
            <value>Karina</value>
        </attr>
        <attr name="loginActivationTime">
            <value convert-from="20180924000000Z" format-from="yyyyMMddHHmmss" format-to="yyyy-MM-dd">2018-09-24</value>
        </attr>
    </searchResultEntry>
</batchResponse>

JavaScript function

/**
 * Annotates a DSML export file in place using the VTD-XML library.
 *
 * The function adds export metadata attributes to the root element, adds
 * `src-dn` / `object-count` attributes to every entry (child of the root),
 * adds a `count` attribute to every multi-valued attribute element, and
 * rewrites individual `<value>` texts (date re-formatting or base64 encoding),
 * recording the original text in a `convert-from` attribute. The modified
 * document is written back to `thisFile`.
 *
 * @param thisFile      Path of the XML file to read and overwrite.
 * @param thisType      File type; anything other than "dsml" is ignored.
 * @param thisAttribute Not used by this function.
 * @param thisSearch    Map-like object (uses `get(dn)` and `size()`); each
 *                      entry's first element is queried with
 *                      `get("meta-date")` / `get("meta-edir")`.
 * @param thisFilter    Search filter text stored in the `search-filter`
 *                      attribute of the root element.
 * @param thisDates     Map-like object (uses `get(attrName)`); presumably the
 *                      first element of the result is "inFormat|outFormat" -
 *                      confirm against the caller.
 * @returns `[]` when the arguments are rejected or the file cannot be parsed;
 *          otherwise nothing (undefined) after the file has been written.
 */
function checkXML ( thisFile, thisType, thisAttribute, thisSearch, thisFilter, thisDates )
{
  // Escapes text so it can be placed inside a double-quoted XML attribute.
  // "&" must be handled first so the entities produced below are not re-escaped.
  function escapeXmlAttr ( value )
  {
    return String(value)
      .replace(/&/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;")
      .replace(/"/g, "&quot;")
      .replace(/'/g, "&apos;");
  }

  if ( thisFile == undefined || thisType != "dsml" )
  {
    return [];
  }

  var vtdNav;
  var xm;

  try
  {
    var vtdGen = new VTDGen();
    // parseFile reports failure through its boolean result, so check it
    // explicitly instead of opening (and leaking) a separate FileReader.
    if ( !vtdGen.parseFile(thisFile, true) )
    {
      throw "unable to read or parse the file";
    }
    vtdNav = vtdGen.getNav();
    xm = new XMLModifier(vtdNav);
  }
  catch (e)
  {
    varError.push("[ checkXML, XML file reader ] : " + thisFile + " => " + e);
    return [];
  }

  // ---- Root element: export metadata -------------------------------------
  var rootPilot = new AutoPilot(vtdNav);
  rootPilot.selectXPath( "count(./*)" );
  var entryCount = rootPilot.evalXPathToNumber();
  rootPilot.selectXPath( "/*/*[1]/@dn" );
  var varUser = rootPilot.evalXPathToString();
  var varValue = thisSearch.get(varUser)[0].get ("meta-date");

  // The leading "" makes join(" ") start with a space, separating the new
  // attributes from whatever precedes the insertion point.
  var varHeader = [""];
  varHeader.push('export-count="' + entryCount + '"');
  varHeader.push('export-date="' + new Date().toLocaleString() + '"');
  varHeader.push('export-ldap-date="' + varValue.toArray()[0] + '"');
  varHeader.push('search-filter="' + escapeXmlAttr(thisFilter) + '"');

  xm.insertAttribute(varHeader.join(" "));

  // ---- Each entry: source DN and position --------------------------------
  var userPilot = new AutoPilot(vtdNav);
  userPilot.selectXPath( "/*/*" );
  while ( userPilot.evalXPath() != -1 )
  {
    var entryTags = [""];
    var entryNodes = new AutoPilot(vtdNav);
    entryNodes.selectXPath( "./@dn" );
    varUser = entryNodes.evalXPathToString();
    varValue = thisSearch.get(varUser)[0].get ("meta-edir");
    entryTags.push('src-dn="' + escapeXmlAttr(replaceInMe(varValue.toArray()[0], "\\\\", "\\")) + '"');
    entryNodes.selectXPath("count(preceding-sibling::*)");
    // Position is 1-based; the total comes from the search map that was passed in.
    var position = Number(entryNodes.evalXPathToString()) + 1 + " of " + thisSearch.size();
    entryTags.push('object-count="' + position + '"');
    xm.insertAttribute(entryTags.join(" "));
  }

  // ---- Each attribute element: number of values when more than one -------
  var attrPilot = new AutoPilot(vtdNav);
  attrPilot.selectXPath( "/*/*/*" );

  while ( attrPilot.evalXPath() != -1 )
  {
    var childCounter = new AutoPilot(vtdNav);
    childCounter.selectXPath("count(./*)");
    var varCount = childCounter.evalXPathToNumber();
    if (varCount > 1)
    {
      xm.insertAttribute(' count="' + varCount + '"');
    }
  }

  // ---- Each value: date conversion or base64 encoding --------------------
  var datePilot = new AutoPilot(vtdNav);
  datePilot.selectXPath( "/*/*/attr/value" );

  while ( datePilot.evalXPath() != -1 )
  {
    var varTags = [""];
    var varMe = undefined;

    var namePilot = new AutoPilot(vtdNav);
    namePilot.selectXPath("../@name");
    var varLDAPAttrName = namePilot.evalXPathToString();

    var textPilot = new AutoPilot(vtdNav);
    textPilot.selectXPath("./text()");
    varValue = textPilot.evalXPathToString();

    var varContains = thisDates.get(varLDAPAttrName);
    if ( varContains && varContains.length > 0 )
    {
      var varFormats = varContains[0].split("\\|");
      var varIn = varFormats[0];
      var varOut = varFormats[1];
      varMe = convertDate ( String(varValue), varIn, varOut );
      varTags.push('convert-from="' + escapeXmlAttr(varValue) + '"');
      varTags.push('format-from="' + escapeXmlAttr(varIn) + '"');
      varTags.push('format-to="' + escapeXmlAttr(varOut) + '"');
    }
    else if (!com.novell.ldap.util.Base64.isLDIFSafe(varValue))
    {
      // Only one conversion per value: applying both would emit the
      // convert-from attribute twice, which is not well-formed XML.
      varMe = com.novell.ldap.util.Base64.encode(varValue);
      varTags.push('convert-from="' + escapeXmlAttr(varValue) + '"');
      varTags.push('xsi:type="xsd:base64Binary"');
    }

    if ( varMe )
    {
      // getText() yields -1 when the element has no text node to replace.
      var textIndex = vtdNav.getText();
      if ( textIndex != -1 )
      {
        xm.updateToken(textIndex, varMe);
      }
    }

    // Index 0 is the "" spacer, so real attributes exist only beyond it.
    if ( varTags.length > 1 )
    {
      xm.insertAttribute(varTags.join(" "));
    }
  }

  xm.output( thisFile );

}