package maito.datacollecting.dcxml;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;

import javax.xml.XMLConstants;
import javax.xml.namespace.NamespaceContext;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.*;

import maito.datacollecting.Record;

/**
* Constructs a Record object from a valid XML String that specifies
* a record in either Dublin Core 2.0 XML or oai_citeseer
* format (see the integer constants in this class).
* Does the "real work" for both OAICiteseerRecordConstructor and
* DCXMLRecordConstructor.
* <p>
* An instance keeps the document and record currently being processed in
* its fields, so a single instance must not be used by several threads at
* the same time.
* 
* @version 1.0
* @author Väinö Ala-Härkönen
* 
*/
public class DCXMLRecordConstructWorker implements ErrorHandler {
    
    private DocumentBuilderFactory factory; 
    private DocumentBuilder builder;   
    
    private Document doc;
    
    private Record rec;
    private int format;
    
    // Created on first use and then reused for every expression, so the
    // XPathFactory lookup is not repeated for each field of each record.
    private XPath xpath;
    
    // The XPath paths of fields to be parsed in a specific record format - relative to the PATHS-type
    private static final String[] DCFIELDS = {"dc:title", "dc:creator", "dc:subject", "dc:description", 
        "dc:publisher", "dc:contributor", "dc:date", "dc:type", "dc:format", "dc:identifier", "dc:language", "dc:relation",
        "dc:bibliographicCitation", "dc:issued", "dc:references", "dc:isReferencedBy", "dc:rightsHolder"};
    private static final String[] CITESEERFIELDS = {"oai_citeseer:author/@name", "oai_citeseer:author/affiliation", 
        "oai_citeseer:relation[@type='References']/oai_citeseer:uri", 
        "oai_citeseer:relation[@type='Is Referenced By']/oai_citeseer:uri"};
    
    // The names for the keys to be added to the Record if not same as the field names
    private static final String[] CITESEERNAMES = { "oai_citeseer:authorName", "oai_citeseer:authorAffiliation",
        "oai_citeseer:relationReferences", "oai_citeseer:relationIsReferencedBy"};
    
    /**
     * The Record is in Dublin Core 2.0 XML format
     */
    public static final int RECORD_DCXML = 0;
    /**
     * The Record is in oai_citeseer format
     */
    public static final int RECORD_CITESEER = 1;
    
    // The XPath paths to the actual metadata, index maps to the record type constants
    private static final String[] PATHS = {"/record/metadata/oai_dc:dc/", "/record/metadata/oai_citeseer:oai_citeseer/"};
    
    /**
     * Constructs a record from the data string. The string must be a string
     * representation of a valid XML document. Root tag should be "record"
     * according to OAI-PMH / oai_citeseer record specifications.
     * <p>
     * The record id is read from <code>/record/header/identifier</code>; when
     * that element is missing or has no text, the Record is created with an
     * empty id. For oai_citeseer records the id is additionally stored in the
     * field <code>oai_citeseer:identifier</code>.
     * 
     * @param data
     * The data the Record should be constructed from
     * @param recordFormat
     * Format of the record to be processed. See integer constants in this class.
     * @return
     * A Record ready for transformation, or <code>null</code> if
     * <code>data</code> could not be parsed as XML
     * @throws IllegalArgumentException
     * if <code>recordFormat</code> is not one of the format constants
     */
    protected Record constructRecord(String data, int recordFormat) throws IllegalArgumentException {

        if (recordFormat != RECORD_DCXML && recordFormat != RECORD_CITESEER) {
            throw new IllegalArgumentException("Illegal record format");
        }
        this.format = recordFormat;
        
        // Create the document
        this.factory = DocumentBuilderFactory.newInstance();
        this.factory.setNamespaceAware(true);
        hardenAgainstExternalEntities(this.factory);
        try {
            this.builder = factory.newDocumentBuilder();            
            this.builder.setErrorHandler(this);
            
            // Parse from characters rather than from data.getBytes(): the
            // latter encodes with the platform default charset, which need
            // not match what the parser assumes and garbles non-ASCII text.
            doc = this.builder.parse(new InputSource(new StringReader(data)));
        }
        catch(Exception e) {
            // Deliberately broad: a null, malformed or otherwise unparsable
            // record is skipped so that the caller can carry on with the next one.
            System.out.println("Invalid record received. Skipping and continuing.");
            return null;
        }

        // Create the record with an id
        String[] thisID = getNodeContent("/record/header/identifier");
        boolean hasID = thisID != null && thisID.length > 0 && thisID[0] != null;
        if (hasID) {
            String id = thisID[0].trim();
            rec = new Record(id);
            if (format == RECORD_CITESEER) {
                rec.setField("oai_citeseer:identifier", id);
            }
        }
        else {
            rec = new Record("");
        }
        
        // Add the dc fields to the Record        
        for (int i = 0; i < DCFIELDS.length; i++) {
            addToRecord(DCFIELDS[i], DCFIELDS[i]);
        }
        
        // Add the citeseer fields to the Record
        if (format == RECORD_CITESEER) {
            for (int i = 0; i < CITESEERFIELDS.length; i++) {
                addToRecord(CITESEERFIELDS[i], CITESEERNAMES[i]);
            }            
        }
        return rec;
    }
    
    /**
     * Switches off external entity and external DTD loading on the factory,
     * because the records handed to this class come from outside sources.
     * Every setting is applied independently and on a best-effort basis.
     * @param target
     * The factory to configure
     */
    private static void hardenAgainstExternalEntities(DocumentBuilderFactory target) {
        setFeatureIfSupported(target, XMLConstants.FEATURE_SECURE_PROCESSING, true);
        setFeatureIfSupported(target, "http://xml.org/sax/features/external-general-entities", false);
        setFeatureIfSupported(target, "http://xml.org/sax/features/external-parameter-entities", false);
        setFeatureIfSupported(target, "http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
    }
    
    /**
     * Sets a parser feature and ignores the request if the parser
     * implementation in use does not know the feature.
     */
    private static void setFeatureIfSupported(DocumentBuilderFactory target, String feature, boolean value) {
        try {
            target.setFeature(feature, value);
        }
        catch(ParserConfigurationException ignored) {
            // An unsupported hardening feature must not make every record unparsable.
        }
    }
    
    /**
     * Adds a specified field to the Record being created. Every matching
     * node that has text content results in one <code>setField</code> call
     * with the trimmed text.
     * @param key
     * The key of the field to be searched for (relative to the PATHS setting)
     * @param name
     * The name of the key to be saved to the Record
     */
    private void addToRecord(String key, String name) {
        String[] values = getNodeContent(PATHS[format] + key);
        if (values == null) {
            return;
        }
        for (int i = 0; i < values.length; i++) {
            if (values[i] != null) {
                rec.setField(name, values[i].trim());
            }
        }
    }
    
    /**
     * Evaluates an XPath expression against the current document and returns
     * the text of every matching node. 
     * @param xpathExpression
     * The expression to evaluate; it may use the prefixes known to
     * NamespaceContextImpl
     * @return
     * One entry per matching node, holding the concatenated text of the
     * node's direct text and CDATA children, or <code>null</code> for a node
     * without such children. <code>null</code> instead of an array if the
     * expression could not be evaluated.
     */
    private String[] getNodeContent(String xpathExpression) {
        try {
            if (xpath == null) {
                xpath = XPathFactory.newInstance().newXPath();
                xpath.setNamespaceContext(new NamespaceContextImpl());
            }
            NodeList list = (NodeList) xpath.evaluate(xpathExpression, doc, XPathConstants.NODESET);
            
            String[] values = new String[list.getLength()];
            for (int j = 0; j < values.length; j++) {
                values[j] = directText(list.item(j));
            }
            return values;
        }
        catch(XPathExpressionException e) {
            // Not expected as long as the hardcoded xpath expressions are
            // correct; reported so that a broken expression does not go unnoticed.
            System.err.println("Could not evaluate XPath expression '" + xpathExpression + "': " + e);
            return null;
        }
    }
    
    /**
     * Concatenates the direct text and CDATA children of a node. All
     * fragments are joined because a parser may deliver the text of one
     * element in several nodes (for instance around CDATA sections or
     * comments); keeping only one of them would lose data.
     * @param node
     * The node to read, may be <code>null</code>
     * @return
     * The joined text, or <code>null</code> if the node is <code>null</code>
     * or has no text or CDATA children
     */
    private static String directText(Node node) {
        if (node == null) {
            return null;
        }
        StringBuilder text = null;
        NodeList children = node.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            short type = child.getNodeType();
            if (type == Node.TEXT_NODE || type == Node.CDATA_SECTION_NODE) {
                if (text == null) {
                    text = new StringBuilder();
                }
                text.append(child.getNodeValue());
            }
        }
        return text == null ? null : text.toString();
    }
    
    /**
     * Intentionally empty: problems in a record are handled in
     * constructRecord through the exception thrown by the parser.
     */
    @Override
    public void error(SAXParseException exception){
        //nothing to do here
    }
    
    /**
     * Intentionally empty: problems in a record are handled in
     * constructRecord through the exception thrown by the parser.
     */
    @Override
    public void fatalError(SAXParseException exception) {
        //nothing to do here
    }
    
    /**
     * Intentionally empty: parser warnings are not reported.
     */
    @Override
    public void warning(SAXParseException exception) {
        //nothing to do here
    }
    
    /**
     * A nested class for handling namespace contexts with XPath. Binds the
     * prefixes oai_dc, dc and oai_citeseer as well as the reserved prefixes
     * xml and xmlns.
     */
    private static class NamespaceContextImpl implements NamespaceContext{
        
        private static final String[] URIS = {"http://www.openarchives.org/OAI/2.0/oai_dc/", 
                "http://purl.org/dc/elements/1.1/", "http://copper.ist.psu.edu/oai/oai_citeseer/"};
        private static final String[] PREFIXES = {"oai_dc", "dc", "oai_citeseer"};
        
        /**
         * @return
         * The URI bound to the prefix, or XMLConstants.NULL_NS_URI for an unknown prefix
         * @throws IllegalArgumentException
         * if the prefix is <code>null</code>, as required by NamespaceContext
         */
        @Override
        public String getNamespaceURI(String thisPrefix){ 
            if (thisPrefix == null) {
                throw new IllegalArgumentException("prefix must not be null");
            }
            for (int i = 0; i < PREFIXES.length; i++) {
                if (thisPrefix.equals(PREFIXES[i])) {
                    return URIS[i];
                }
            }
            // The two reserved prefixes have fixed bindings in every context.
            if (XMLConstants.XML_NS_PREFIX.equals(thisPrefix)) {
                return XMLConstants.XML_NS_URI;
            }
            if (XMLConstants.XMLNS_ATTRIBUTE.equals(thisPrefix)) {
                return XMLConstants.XMLNS_ATTRIBUTE_NS_URI;
            }
            return XMLConstants.NULL_NS_URI;            
        }
        
        /**
         * @return
         * The prefix bound to the URI, or XMLConstants.DEFAULT_NS_PREFIX for an unknown URI
         * @throws IllegalArgumentException
         * if the URI is <code>null</code>, as required by NamespaceContext
         */
        @Override
        public String getPrefix(String thisURI){
            if (thisURI == null) {
                throw new IllegalArgumentException("namespace URI must not be null");
            }
            for (int i = 0; i < URIS.length; i++) {
                if (thisURI.equals(URIS[i])) {
                    return PREFIXES[i];
                }
            }
            // Kept as the empty prefix (not null) for unknown URIs, because
            // testNamespaceImpl and possibly other callers rely on that value.
            return XMLConstants.DEFAULT_NS_PREFIX;
        }
        
        /**
         * @return
         * An iterator over the single prefix bound to the URI, or an empty
         * iterator for an unknown URI
         * @throws IllegalArgumentException
         * if the URI is <code>null</code>, as required by NamespaceContext
         */
        @Override
        public Iterator<String> getPrefixes(String thisURI){
            if (thisURI == null) {
                throw new IllegalArgumentException("namespace URI must not be null");
            }
            ArrayList<String> matches = new ArrayList<String>();
            for (int i = 0; i < URIS.length; i++) {
                if (thisURI.equals(URIS[i])) {
                    matches.add(PREFIXES[i]);
                    break;
                }
            }
            return matches.iterator();
        }
                
    }
    
    /**
     * This method is needed to reach and test the nested class. It does nothing useful.
     * @return
     * <code>true</code> if all lookups on the namespace context gave the expected result
     */
    protected boolean testNamespaceImpl () {
        
        NamespaceContext NSContext = new NamespaceContextImpl();
        boolean ok1 = NSContext.getNamespaceURI("dc").equals("http://purl.org/dc/elements/1.1/");
        boolean ok2 = NSContext.getPrefix("http://copper.ist.psu.edu/oai/oai_citeseer/").equals("oai_citeseer");
        String temp = (String) NSContext.getPrefixes("http://www.openarchives.org/OAI/2.0/oai_dc/").next();
        boolean ok3 = temp.equals("oai_dc");
        boolean ok4 = NSContext.getNamespaceURI("foo").equals(XMLConstants.NULL_NS_URI);
        boolean ok5 = NSContext.getPrefix("http://foo.bar/").equals(XMLConstants.DEFAULT_NS_PREFIX);
        boolean ok6 = !NSContext.getPrefixes("http://foo.bar").hasNext();
        return ok1 && ok2 && ok3 && ok4 && ok5 && ok6;
    }    

}
