2015-06-30

Using libxml for XPath

With the proliferation of data presented as XML, I find that parsing it gets harder as more and more features (like attributes and namespaces) are introduced.  So far, I still find XPath syntax the best way to address XML nodes, because it reduces the complicated hierarchical structure to the conventional slash form, like:
/A/B/C
to select the C element in the XML
<A>
  <B>
    <C/>
  </B>
</A>
(example borrowed from https://en.wikipedia.org/wiki/XPath)

Although I have used Java's built-in XPath features previously, for a C project I need to resort to an external library for the job.  I finally chose libxml.  One of the headaches with libxml is its memory management: objects returned by the library must be freed explicitly, otherwise memory leaks are easily introduced.
This document describes my learning.

The source code of my example is xpath_demo.c and the full listing is included in another post (link).

Compilation
Most of the installation of libxml is at /usr/local, therefore the sample program xpath_demo.c is compiled with the following switches:
cc -o xpath_demo -L/usr/local/lib -R/usr/local/lib -lxml2 -I/usr/local/include/libxml2 xpath_demo.c

Program Structure
The program has only two functions, the main function (which includes most of the logic) and register_namespaces (which is copied from libxml site for the name space registration)

Program Usage
The simplest usage is:
xpath_demo xml_filename xpath_expression
If there is name space, then the usage will be:
xpath_demo xml_filename xpath_expression name_space_list

Pseudo Codes

Invoke libxml function
Input/Output
Outstanding Object
xmlParseFile
Input: xml filename
Output: xmlDocPtr
xmlDocPtr
xmlXPathNewContext
Input: xmlDocPtr
Output: xmlXPathContextPtr
xmlDocPtr
xmlXPathContextPtr
xmlXPathRegisterNs (only applicable for xml with namespace)
Input: xmlXPathContextPtr
namespace_prefix
namespace_URL

xmlXPathEvalExpression
Input: XPath_Expression,
xmlXPathContextPtr
Output: xmlXPathObjectPtr
xmlDocPtr
xmlXPathContextPtr
xmlXPathObjectPtr
xmlXPathFreeContext

xmlDocPtr
xmlXPathObjectPtr
Check if xmlXPathNodeSetIsEmpty
Input: xmlXPathObjectPtr->nodesetval

Retrieve the node:
xmlXPathObjectPtr ->nodesetval->nodeTab[0]


Retrieve the text of the node
xmlNodeGetContent
Input: xmlNode *
Output: xmlChar *
xmlDocPtr
xmlXPathObjectPtr
xmlChar * node_text
xmlFree (node_text)
xmlXPathFreeObject(xmlXPathObjectPtr)

xmlDocPtr
Final Clean up
xmlFreeDoc(xmlDocPtr);
xmlCleanupParser();

Nil

The "simplified" print out of various inputs are shown as follows:
cat data.xml
<?xml version='1.0'?>
<Envelope>
<Header>Header_Text</Header>
<Body attribute1='funny'>
<Field1>Value1</Field1>
<Field2>Value2</Field2>
</Body>
</Envelope>

xpath_demo error.xml /FIELD1
I/O warning : failed to load external entity "error.xml"
Error: Document not parsed successfully.

xpath_demo data.xml /Envelope/Body
node-text: "
Value1
Value2
"
Remark: According to the specification, the text of a node includes the text of all its child nodes as well.

xpath_demo data.xml /Envelope/Body/Field1
node-text: "Value1"

xpath_demo data.xml /Envelope/Body/Field3
Empty

Cases with Name Space
Personally I do not like name space in XML because it is awkward.  Anyway, libxml does support it, with an additional step to register the name space list.

An XML file (ns_data.xml) with name space is shown below:

<?xml version='1.0'?>
<Envelope xmlns:ns1='http://www.domain.com/ns/sample'>
<Header>Header_Text</Header>
<Body name='value'>
<ns1:Field1>Value1 in ns1</ns1:Field1>
<Field1>Value1 without NS</Field1>
</Body>
</Envelope>

A namespace ns1 is defined in the root element <Envelope>.  You can see there are two tags named Field1, one of which is in name space ns1.  They can be accessed separately as follows:

xpath_demo ns_data.xml /Envelope/Body/ns1:Field1 ns1=http://www.domain.com/ns/sample
node-text: "Value1 in ns1"

xpath_demo ns_data.xml /Envelope/Body/Field1 ns1=http://www.domain.com/ns/sample

node-text: "Value1 without NS"

xpath_demo.c

/*
File : xpath_demo.c
Description: A demo C program to print the text of a node from a xpath expression
Usage: xpath_demo xml_file xpath_expression [known-ns-list]
Dependence: libxml2
How to make:
cc -o xpath_demo -L/usr/local/lib -R/usr/local/lib -lxml2 -I/usr/local/include/libxml2 xpath_demo.c
*/

#include <libxml/parser.h>
#include <libxml/xpath.h>
#include <libxml/xpathInternals.h>   /* for function xmlXPathRegisterNs */
#include <assert.h>

/* Function Prototype */
int  register_namespaces(xmlXPathContextPtr xpathCtx, const xmlChar* nsList);

/* ===================================================================== */
int main(int argc, char **argv) {

xmlDocPtr doc;
/* xmlNodeSetPtr nodeset; */
xmlXPathObjectPtr result;
xmlNode *node;
xmlChar *node_text;
xmlXPathContextPtr context;
char *filename;
xmlChar *xpath_expression;
xmlChar *nsList;

if ((argc != 3) && (argc != 4)) {
  fprintf(stderr, "Usage: %s xml_file xpath_expression  [<known-ns-list>]\n", argv[0]);
  fprintf(stderr, "where <known-ns-list> is a list of known namespaces\n");
  fprintf(stderr, "in \"<prefix1>=<href1> <prefix2>=href2> ...\" format\n");
  return(1);
  }
filename = argv[1];
xpath_expression = (xmlChar*) argv[2];

fprintf (stderr, "DEBUG: LIBXML_VERSION is " LIBXML_VERSION_STRING "\n");

doc = xmlParseFile(filename);
if (doc == NULL ) {
  fprintf(stderr, "Error: Document not parsed successfully.\n");
  xmlCleanupParser();
  return 1;
  }

context = xmlXPathNewContext(doc);
if (context == NULL) {
  fprintf(stderr, "Error in xmlXPathNewContext\n");
  xmlFreeDoc(doc);
  xmlCleanupParser();
  return 2;
  }

if (argc == 4) {
  nsList = (xmlChar*) argv[3];
  if (register_namespaces(context, nsList) < 0) {
    fprintf(stderr,"Error: failed to register namespaces list \"%s\"\n", nsList);
    xmlXPathFreeContext(context);
    xmlFreeDoc(doc);
    xmlCleanupParser();
    return 3;
    }
  }
result = xmlXPathEvalExpression(xpath_expression, context);
xmlXPathFreeContext (context);
if (result == NULL) {
  fprintf(stderr, "Error in xmlXPathEvalExpression\n");
  xmlFreeDoc(doc);
  xmlCleanupParser();
  return 4;
  }

/*
xmlXPathEvalExpression() call returns a set of ALL the nodes that match the expression
We are only interested in the first node returned
*/

if (xmlXPathNodeSetIsEmpty(result->nodesetval)) {
  printf ("Empty\n");
  xmlXPathFreeObject(result);
  xmlFreeDoc(doc);
  xmlCleanupParser();
  return 5;
  }

/* Retrieve the data */
node = result->nodesetval->nodeTab[0];
node_text = xmlNodeGetContent(node);
/* xmlNodeGetContent retrieves the text values of all children too.  This is correct */

fprintf (stderr, "DEBUG: node-type: %d node-name: %s\n" ,node->type, node->name);
xmlAttr *attr = node->properties;
while ( attr ) {
  fprintf (stderr, "DEBUG: attribute-name:%s attribute-value:%s\n" , attr->name, attr->children->content);
  attr = attr->next;
  } /* while */
printf ("node-text: \"%s\"\n", node_text);
xmlFree (node_text);
xmlXPathFreeObject(result);

/* Final clean up */
xmlFreeDoc(doc);
xmlCleanupParser();
return (0);
} /* main */

/**************************************************************************************/
/* The following function is extracted from http://www.xmlsoft.org/examples/xpath1.c  */
/**************************************************************************************/

/**
 * register_namespaces:
 * @xpathCtx:           XPath context that receives the registrations.
 * @nsList:             space-separated list of "<prefix>=<href>" pairs.
 *
 * Works on a private copy of @nsList, cutting it in place into prefix and
 * href strings and registering each pair with xmlXPathRegisterNs().
 * Parsing stops at the first malformed entry (one without '=') or at the
 * first registration failure; an error message is written to stderr.
 *
 * Returns 0 when every pair was registered, -1 otherwise.
 */
int
register_namespaces(xmlXPathContextPtr xpathCtx, const xmlChar* nsList) {
    xmlChar *copy;
    xmlChar *cursor;
    int status = 0;

    assert(xpathCtx);
    assert(nsList);

    /* The list is tokenised destructively, so operate on a duplicate. */
    copy = xmlStrdup(nsList);
    if (copy == NULL) {
        fprintf(stderr, "Error: unable to strdup namespaces list\n");
        return(-1);
    }

    cursor = copy;
    while (cursor != NULL) {
        xmlChar *prefix;
        xmlChar *href;
        xmlChar *separator;

        /* Leading blanks between entries are ignored. */
        while (*cursor == ' ') {
            cursor++;
        }
        if (*cursor == '\0') {
            break;
        }

        /* The prefix runs up to the '=' sign. */
        prefix = cursor;
        separator = (xmlChar *) xmlStrchr(prefix, '=');
        if (separator == NULL) {
            fprintf(stderr,"Error: invalid namespaces list format\n");
            status = -1;
            break;
        }
        *separator = '\0';

        /* The href runs up to the next blank, or to the end of the list. */
        href = separator + 1;
        cursor = (xmlChar *) xmlStrchr(href, ' ');
        if (cursor != NULL) {
            *cursor = '\0';
            cursor++;
        }

        if (xmlXPathRegisterNs(xpathCtx, prefix, href) != 0) {
            fprintf(stderr,"Error: unable to register NS with prefix=\"%s\" and href=\"%s\"\n", prefix, href);
            status = -1;
            break;
        }
    }

    xmlFree(copy);
    return(status);
}