/*
* ListRecords.java
* 
* v0.1
*
* 04.11.2005
* 
* This software is released under the GNU GPL license
*/
package maito.datacollecting.oaipmh;

import java.io.*;
import java.net.*;
import org.xml.sax.*;
import org.xml.sax.helpers.XMLReaderFactory;
import java.util.regex.*;


/**
 * A class that represents an OAI-PMH response with the verb <code>ListRecords</code>.
 * The response of the server can be retrieved piece by piece through an instance of this class.
 * The response handed out contains only the <code>record</code> tags; all other markup is dropped.
 * When the repository splits its answer into several pages (flow control with a
 * <code>resumptionToken</code>), the following pages are fetched on demand.
 * <p>
 * Instances are not safe for use by several threads at the same time.
 * 
 * @author Antti Laitinen
 */
public class ListRecords {

    /** Timeout in milliseconds for establishing the connection to the repository. */
    private static final int CONNECTION_TIMEOUT = 5000;

    /** Character encoding used for decoding responses and for encoding request arguments. */
    private static final String RESPONSE_ENCODING = "UTF-8";

    /** Matches the opening <code>resumptionToken</code> tag, with or without attributes. */
    private static final Pattern RESUMPTION_TOKEN_START = Pattern.compile("<resumptionToken[^>]*>");

    private static final String RESUMPTION_TOKEN_END = "</resumptionToken>";

    /* NOTE(review): only the literal, attribute-less and prefix-less form of the tag is recognised. */
    private static final String RECORD_START = "<record>";
    private static final String RECORD_END = "</record>";

    private final String baseURL;
    private final ListRecordsContentHandler contentHandler;
    private final XMLReader xmlReader;

    /** The URL of the next request that is sent to the repository. */
    private URL url;

    /** The token that <code>url</code> was built with; null when no further page is pending. */
    private String resumptionToken;

    /** Record markup that has been fetched but not yet handed out by <code>nextPart</code>. */
    private StringBuilder buffer;

    /**
     * Creates a new ListRecords object. The remote OAI-PMH repository is queried with the given parameters.
     * @param baseURL
     * The base URL for the repository that is to be read without any parameters. Must not be null.
     * @param metadataPrefix
     * <code>metadataPrefix</code> in the OAI-PMH request. Must not be null.
     * @param dateFrom
     * <code>from</code> in the OAI-PMH request. If this is null then no <code>from</code> is included in the OAI-PMH request. This parameter is passed to the request as it is.
     * @throws OAIPMHException
     * Thrown if anything fails while reading data from the repository.
     */
    protected ListRecords(String baseURL, String metadataPrefix, String dateFrom) throws OAIPMHException {

        /*Test parameters*/
        if (baseURL == null) {
            throw new OAIPMHException("parameter baseURL is null");
        }

        if (metadataPrefix == null) {
            throw new OAIPMHException("parameter metadataPrefix is null");
        }

        /*prepare for xml parsing*/
        this.contentHandler = new ListRecordsContentHandler();
        this.xmlReader = createXmlReader(this.contentHandler);

        /*init other fields*/
        this.baseURL = baseURL;
        this.buffer = new StringBuilder();
        this.resumptionToken = null;

        /*construct the needed URL*/
        String completeURL = baseURL + "?verb=ListRecords&metadataPrefix=" + metadataPrefix;

        if (dateFrom != null) {
            completeURL += "&from=" + dateFrom;
        }

        try {
            this.url = new URL(completeURL);
        }
        catch (MalformedURLException e) {
            throw new OAIPMHException(e.getMessage());
        }

        /*read the data from the server*/
        readFromRepository();
    }


    /**
     * Returns a requested amount of characters from the OAI-PMH response. 
     * The characters are returned as a String and extracted from this object's 
     * internal buffer so they will no longer be available. The response contains 
     * only the <code>record</code> tags from the original response.
     * Further pages are requested from the repository when the buffered data does not
     * suffice and a resumptionToken is pending.
     * 
     * @param chars
     * The amount of characters that is to be extracted from the response.
     * If the amount of available characters is less than the requested amount 
     * then all available characters are returned.
     * @return
     * Returns the requested amount of characters from the response. 
     * If no characters are available or the parameter chars is not positive, returns null.
     * @throws OAIPMHException
     * Thrown if fetching a further page from the repository fails.
     */
    protected String nextPart(int chars) throws OAIPMHException {

        if (chars <= 0) {
            return null;
        }

        if (this.buffer.length() == 0 && this.resumptionToken == null) {
            return null;
        }

        StringBuilder part = new StringBuilder();
        int remaining = chars;

        while (true) {
            int take = Math.min(remaining, this.buffer.length());
            part.append(this.buffer.substring(0, take));
            this.buffer.delete(0, take);
            remaining -= take;

            if (remaining == 0 || this.resumptionToken == null) {
                break;
            }

            /* The buffer is exhausted here but the repository has announced more data. */
            this.readFromRepository();
        }

        /* Nothing collected means the last page held no records and nothing is pending. */
        return part.length() == 0 ? null : part.toString();
    }


    /**
     * Retrieves data from <code>url</code> and puts the <code>record</code> tags of the
     * retrieved data into <code>buffer</code>. Responses that cannot be parsed as xml are
     * skipped as long as a resumptionToken can still be found in them.
     * If this method fails, <code>buffer</code> is left empty so that unfiltered response
     * text can never be handed out as record data.
     * @throws OAIPMHException
     * Thrown if the repository cannot be read, if the response is corrupt beyond recovery
     * or if the response carries an OAI-PMH error.
     */
    private void readFromRepository() throws OAIPMHException {

        boolean completed = false;

        try {

            boolean parsingSuccessful = false;

            while (!parsingSuccessful) {

                this.fillBuffer();

                /*parse the response xml*/
                InputSource source = new InputSource(new StringReader(this.buffer.toString()));
                try {
                    this.xmlReader.parse(source);
                    parsingSuccessful = true;
                }
                catch (SAXException e) {
                    this.skipCorruptResponse(e);
                }
            }

            /*Check for an OAI-PMH error in the response.*/
            String error = this.contentHandler.getError();

            if (error != null) {
                String message = "An error was found in the OAI-PMH response: " + error;
                throw new OAIPMHException(message);
            }

            /*Check the response for a resumption token and do needed operations.*/
            this.handleResumptionToken();

            /*Remove everything except record tags from the buffer.*/
            this.extractRecords();

            completed = true;
        }
        catch (IOException e) {
            throw new OAIPMHException(e.getMessage());
        }
        finally {
            if (!completed) {
                this.buffer = new StringBuilder();
            }
        }
    }


    /**
     * Handles a response that could not be parsed as xml. The response is
     * omitted and a resumptionToken is searched manually using a regular expression.
     * If one is found, <code>url</code> is pointed at the next page. Otherwise the
     * harvesting is stopped with an exception.
     * @param cause
     * The parse failure that made the response unusable.
     * @throws OAIPMHException
     * Thrown if no resumptionToken is found or if the token found is the one the corrupt
     * response was requested with (requesting it again would only repeat the same response).
     */
    private void skipCorruptResponse(SAXException cause) throws OAIPMHException {

        String requestedWith = this.resumptionToken;

        this.resumptionToken = this.searchResumptionTokenFromCorruptXML();

        if (this.resumptionToken == null) {
            throw new OAIPMHException("OAI-PMH harvesting stopped. The server's response was corrupted.\n(details: " + cause.getMessage() + " )");
        }

        if (this.resumptionToken.equals(requestedWith)) {
            throw new OAIPMHException("OAI-PMH harvesting stopped. The server's response was corrupted and repeated the resumptionToken of the request.\n(details: " + cause.getMessage() + " )");
        }

        System.out.println(this.baseURL + " sent an invalid xml response. Some of the data was ignored.\n"
                + "(details: " + cause.getMessage() + ")");

        this.url = this.buildResumptionUrl(this.resumptionToken);
    }


    /**
     * Reads data from the current URL and fills the buffer with data.
     * The previous content of the buffer is discarded. The response is decoded as UTF-8
     * and every line read is terminated with a single newline character.
     * @throws IOException
     * Thrown if connecting to or reading from the repository fails.
     */
    private void fillBuffer() throws IOException {

        this.buffer = new StringBuilder();

        /*read the data into the internal buffer*/
        URLConnection con = this.url.openConnection();
        con.setConnectTimeout(CONNECTION_TIMEOUT);

        InputStream in = con.getInputStream();

        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, RESPONSE_ENCODING));

            String line = reader.readLine();

            while (line != null) {
                this.buffer.append(line).append('\n');
                line = reader.readLine();
            }
        }
        finally {
            /* Release the connection even when reading fails half way. */
            in.close();
        }
    }

    /**
     * Asks the content handler for the resumptionToken of the document parsed last. If there is one this object's state is changed accordingly (fields <code>resumptionToken</code> and <code>url</code>). If there is none, <code>resumptionToken</code> is set to null and <code>url</code> is left unchanged.
     * @throws OAIPMHException
     * Thrown if the URL of the next request cannot be built.
     */
    private void handleResumptionToken() throws OAIPMHException {

        this.resumptionToken = this.contentHandler.getResumptionToken();

        if (this.resumptionToken != null) {
            this.url = this.buildResumptionUrl(this.resumptionToken);
        }
    }

    /**
     * Builds the URL of the request that continues the list with the given resumptionToken.
     * The token is URL-encoded because it may contain characters that are reserved in a query string.
     * @param token
     * The resumptionToken as received from the repository. Must not be null.
     * @return
     * The URL of the follow-up request.
     * @throws OAIPMHException
     * Thrown if the URL cannot be built. Not expected because baseURL has already been used for a valid URL.
     */
    private URL buildResumptionUrl(String token) throws OAIPMHException {

        try {
            return new URL(this.baseURL + "?verb=ListRecords&resumptionToken="
                    + URLEncoder.encode(token, RESPONSE_ENCODING));
        }
        catch (IOException e) {
            /* MalformedURLException or UnsupportedEncodingException */
            throw new OAIPMHException(e);
        }
    }

    /**
     * If the received data is corrupt and cannot be parsed as an xml then this method
     * can be used for searching for a resumptionToken. The first opening
     * <code>resumptionToken</code> tag in <code>buffer</code> and the first closing tag
     * after it are used.
     * @return
     * The resumptionToken as a String with surrounding whitespace removed.
     * null if a resumptionToken was not found or if it is empty.
     */
    private String searchResumptionTokenFromCorruptXML() {

        Matcher startMatcher = RESUMPTION_TOKEN_START.matcher(this.buffer);

        if (!startMatcher.find()) {
            return null;
        }

        int tokenStart = startMatcher.end();
        int tokenEnd = this.buffer.indexOf(RESUMPTION_TOKEN_END, tokenStart);

        if (tokenEnd < 0) {
            return null;
        }

        String token = this.buffer.substring(tokenStart, tokenEnd).trim();

        /* An empty element carries no token, there is nothing to continue with. */
        return token.length() == 0 ? null : token;
    }


    /**
     * This method clears the buffer from all other markup except <code>record</code> tags.
     * Every span from a <code>&lt;record&gt;</code> start tag to the next
     * <code>&lt;/record&gt;</code> end tag is kept, in document order. A start tag
     * without a following end tag ends the extraction.
     */
    private void extractRecords() {

        StringBuilder records = new StringBuilder();

        int start = this.buffer.indexOf(RECORD_START);

        while (start >= 0) {

            int end = this.buffer.indexOf(RECORD_END, start + RECORD_START.length());

            if (end < 0) {
                break;
            }

            int afterEnd = end + RECORD_END.length();
            records.append(this.buffer.substring(start, afterEnd));
            start = this.buffer.indexOf(RECORD_START, afterEnd);
        }

        this.buffer = records;
    }

    /**
     * Creates the xml reader that parses the responses and reports to the given handler.
     * The responses come from a remote server, so the resolving of external entities
     * and external DTDs is switched off where the parser allows it.
     * @param handler
     * The object that receives the content and error events.
     * @return
     * The configured reader.
     * @throws OAIPMHException
     * Thrown if no xml reader can be created.
     */
    private static XMLReader createXmlReader(ListRecordsContentHandler handler) throws OAIPMHException {

        try {
            XMLReader reader = XMLReaderFactory.createXMLReader();
            disableFeature(reader, "http://xml.org/sax/features/external-general-entities");
            disableFeature(reader, "http://xml.org/sax/features/external-parameter-entities");
            disableFeature(reader, "http://apache.org/xml/features/nonvalidating/load-external-dtd");
            reader.setContentHandler(handler);
            reader.setErrorHandler(handler);
            return reader;
        }
        catch (SAXException e) {
            throw new OAIPMHException("couldn't prepare for xml parsing (details: " + e.getMessage() + ")");
        }
    }

    /**
     * Switches a parser feature off if the parser knows it.
     * @param reader
     * The reader to configure.
     * @param feature
     * The name of the feature.
     */
    private static void disableFeature(XMLReader reader, String feature) {

        try {
            reader.setFeature(feature, false);
        }
        catch (SAXException ignored) {
            /* Best effort: a parser that does not know the feature keeps its defaults. */
        }
    }

}
