On a recent project, I had a very large XML file which I needed to manipulate. Using the normal DOM method for parsing the XML was very slow. For a file with 8,000 Users, the DOM method took almost 10 minutes. This was clearly unacceptable, and I knew I could improve the speed.
That is when I stumbled upon the VTD-XML Java library. It took me quite a while to get my head around its cursor concept and implementation, but I got there in the end. Replacing my DOM code with the VTD-XML code (example below) made XML parsing and manipulation 8 times faster.
Find out more about the VTD-XML Java library on the project site: https://vtd-xml.sourceforge.io/
XML files
Original XML file
<batchResponse xmlns="urn:oasis:names:tc:DSML:2:0:core" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<searchResultEntry dn="cn=karble,ou=Users,ou=Active,o=corp">
<attr name="mail">
<value>TEST_1_Karina.Bleidere@belkast.com</value>
<value>TEST_2_Karina.Bleidere@belkast.com</value>
</attr>
<attr name="workforceID">
<value>12345</value>
</attr>
<attr name="sn">
<value>Bleidere</value>
</attr>
<attr name="co">
<value>Latvia</value>
</attr>
<attr name="fullName">
<value>Karīna Bleidere</value>
</attr>
<attr name="givenName">
<value>Karina</value>
</attr>
<attr name="loginActivationTime">
<value>20180924000000Z</value>
</attr>
</searchResultEntry>
</batchResponse>
Updated XML file
<batchResponse export-count="1" export-date="November 12, 2024 7:59:42 PM EST" export-ldap-date="20241112195942Z" search-filter="workforceID=12345" xmlns="urn:oasis:names:tc:DSML:2:0:core" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<searchResultEntry src-dn="\belkast\corp\Active\Users\karble" object-count="1 of 1" dn="cn=karble,ou=Users,ou=Active,o=corp">
<attr name="mail" count="2">
<value>TEST_1_Karina.Bleidere@belkast.com</value>
<value>TEST_2_Karina.Bleidere@belkast.com</value>
</attr>
<attr name="workforceID">
<value>12345</value>
</attr>
<attr name="sn">
<value>Bleidere</value>
</attr>
<attr name="co">
<value>Latvia</value>
</attr>
<attr name="fullName">
<value convert-from="Karīna Bleidere" xsi:type="xsd:base64Binary">S2FyxKtuYSBCbGVpZGVyZQ==</value>
</attr>
<attr name="givenName">
<value>Karina</value>
</attr>
<attr name="loginActivationTime">
<value convert-from="20180924000000Z" format-from="yyyyMMddHHmmss" format-to="yyyy-MM-dd">2018-09-24</value>
</attr>
</searchResultEntry>
</batchResponse>
JavaScript function
/**
 * Annotates a DSML export file in place using VTD-XML.
 *
 * Three kinds of annotation are written, then the file is overwritten:
 *   1. root element: export-count, export-date, export-ldap-date, search-filter
 *   2. each child of the root: src-dn and object-count ("N of M")
 *   3. each attr with more than one child: count; each attr/value: either a
 *      date conversion (convert-from, format-from, format-to) or a Base64
 *      conversion (convert-from, xsi:type), with the text replaced.
 *
 * Relies on names defined outside this function: varError (error list),
 * varSearch, convertDate and replaceInMe.
 *
 * @param thisFile      path of the XML file to read and overwrite
 * @param thisType      must be "dsml"; anything else returns [] immediately
 * @param thisAttribute not used by this function
 * @param thisSearch    map keyed by entry dn; each value is indexable and its
 *                      first element maps "meta-date" / "meta-edir" to a collection
 * @param thisFilter    search filter text, written (escaped) into search-filter
 * @param thisDates     map of attribute name to an indexable whose first element
 *                      is "inputFormat|outputFormat"
 * @return [] when the input is rejected or an error was recorded in varError;
 *         nothing (undefined) after the file has been written
 */
function checkXML ( thisFile, thisType, thisAttribute, thisSearch, thisFilter, thisDates )
{
    // Escapes a value for use inside a double-quoted XML attribute.
    function escapeAttr ( value )
    {
        return String(value)
            .replace(/&/g, "&amp;")   // must run first so later entities are not re-escaped
            .replace(/</g, "&lt;")
            .replace(/>/g, "&gt;")
            .replace(/"/g, "&quot;")
            .replace(/'/g, "&apos;");
    }

    // Evaluates a compiled string XPath at the current cursor, then re-arms the pilot for reuse.
    function evalString ( pilot )
    {
        var text = pilot.evalXPathToString();
        pilot.resetXPath();
        return text;
    }

    // Returns the first value stored under key for the given dn in thisSearch, or null when absent.
    function firstMeta ( dn, key )
    {
        var entries = thisSearch.get(dn);
        if ( !entries )
        {
            return null;
        }
        var first = entries[0];
        if ( !first )
        {
            return null;
        }
        var values = first.get(key);
        if ( !values )
        {
            return null;
        }
        var list = values.toArray();
        return list.length > 0 ? list[0] : null;
    }

    if ( thisFile == undefined || thisType != "dsml" )
    {
        return [];
    }

    var vtdGen, vtdNav, xm;
    try
    {
        vtdGen = new VTDGen();
        // parseFile reports failure through its boolean result, so it has to be checked explicitly.
        if ( !vtdGen.parseFile(thisFile, true) )
        {
            throw "unable to read or parse file";
        }
        vtdNav = vtdGen.getNav();
        xm = new XMLModifier(vtdNav);
    }
    catch (e)
    {
        varError.push("[ checkXML, XML file reader ] : " + thisFile + " => " + e);
        return [];
    }

    // ---- 1. root element header -------------------------------------------
    var rootPilot = new AutoPilot(vtdNav);
    rootPilot.selectXPath( "count(./*)" );
    var entryCount = rootPilot.evalXPathToNumber();
    rootPilot.selectXPath( "/*/*[1]/@dn" );
    var firstDn = rootPilot.evalXPathToString();

    // The export timestamp is taken from the metadata of the first entry in the file.
    var ldapDate = firstMeta(firstDn, "meta-date");
    if ( ldapDate == null )
    {
        varError.push("[ checkXML, header ] : " + thisFile + " => no meta-date for '" + firstDn + "'");
        return [];
    }

    // The leading "" makes join(" ") start with a space, separating the new
    // attributes from whatever precedes the insertion point.
    var headerTags = [ "" ];
    headerTags.push('export-count="' + entryCount + '"');
    headerTags.push('export-date="' + new Date().toLocaleString() + '"');
    headerTags.push('export-ldap-date="' + ldapDate + '"');
    headerTags.push('search-filter="' + escapeAttr(thisFilter) + '"');
    xm.insertAttribute(headerTags.join(" "));

    // ---- 2. per-entry attributes ------------------------------------------
    // The relative XPaths are compiled once and evaluated at each cursor position.
    var dnPilot = new AutoPilot(vtdNav);
    dnPilot.selectXPath( "./@dn" );
    var positionPilot = new AutoPilot(vtdNav);
    positionPilot.selectXPath( "count(preceding-sibling::*)" );

    var userPilot = new AutoPilot(vtdNav);
    userPilot.selectXPath( "/*/*" );
    while ( userPilot.evalXPath() != -1 )
    {
        var entryDn = evalString(dnPilot);
        var sourceDn = firstMeta(entryDn, "meta-edir");
        if ( sourceDn == null )
        {
            varError.push("[ checkXML, entry ] : " + thisFile + " => no meta-edir for '" + entryDn + "'");
            return [];
        }

        var position = Number(evalString(positionPilot)) + 1;

        var entryTags = [ "" ];
        // NOTE(review): the src-dn value is written unescaped, as before — confirm it can never contain & < or ".
        entryTags.push('src-dn="' + replaceInMe(sourceDn, "\\\\", "\\") + '"');
        // NOTE(review): varSearch is not a parameter of this function; presumably a
        // global holding the same search results as thisSearch — confirm.
        entryTags.push('object-count="' + position + " of " + varSearch.size() + '"');
        xm.insertAttribute(entryTags.join(" "));
    }

    // ---- 3a. value count on multi-valued attributes -----------------------
    var childCountPilot = new AutoPilot(vtdNav);
    childCountPilot.selectXPath( "count(./*)" );

    var attrPilot = new AutoPilot(vtdNav);
    attrPilot.selectXPath( "/*/*/*" );
    while ( attrPilot.evalXPath() != -1 )
    {
        var valueCount = childCountPilot.evalXPathToNumber();
        childCountPilot.resetXPath();
        if ( valueCount > 1 )
        {
            xm.insertAttribute(' count="' + valueCount + '"');
        }
    }

    // ---- 3b. date / Base64 conversion of individual values ----------------
    var namePilot = new AutoPilot(vtdNav);
    namePilot.selectXPath( "../@name" );
    var textPilot = new AutoPilot(vtdNav);
    textPilot.selectXPath( "./text()" );

    var valuePilot = new AutoPilot(vtdNav);
    valuePilot.selectXPath( "/*/*/attr/value" );
    while ( valuePilot.evalXPath() != -1 )
    {
        var attrName = evalString(namePilot);
        var rawValue = evalString(textPilot);

        var valueTags = [ "" ];
        // Reset explicitly: "var" is function-scoped, so the previous iteration's value would survive.
        var replacement = undefined;

        var formats = thisDates.get(attrName);
        if ( formats && formats.length > 0 )
        {
            var parts = formats[0].split("\\|");
            var formatIn = parts[0];
            var formatOut = parts[1];
            replacement = convertDate ( String(rawValue), formatIn, formatOut );
            valueTags.push('convert-from="' + escapeAttr(rawValue) + '"');
            valueTags.push('format-from="' + formatIn + '"');
            valueTags.push('format-to="' + formatOut + '"');
        }
        // "else": applying both conversions would emit convert-from twice and
        // produce malformed XML, so a configured date attribute takes the date path only.
        else if ( !com.novell.ldap.util.Base64.isLDIFSafe(rawValue) )
        {
            replacement = com.novell.ldap.util.Base64.encode(rawValue);
            valueTags.push('convert-from="' + escapeAttr(rawValue) + '"');
            valueTags.push('xsi:type="xsd:base64Binary"');
        }

        // getText() is -1 when the element has no text node; there is nothing to replace then.
        var textIndex = vtdNav.getText();
        if ( replacement && textIndex != -1 )
        {
            xm.updateToken(textIndex, replacement);
        }

        // Index 0 is the "" separator, so real attributes exist only beyond it.
        if ( valueTags.length > 1 )
        {
            xm.insertAttribute(valueTags.join(" "));
        }
    }

    xm.output( thisFile );
}