package maito.util;

import java.io.*;
import java.util.Locale;
import java.util.Properties;
import java.util.regex.Pattern;

/**
 * A class that contains static utility methods: canonization of metadata
 * strings (generic text, actors, dates, languages, titles), a heuristic
 * categorization of identifiers and small UTF-8 file helpers.
 * The regular expressions used by the canonizers are compiled once as
 * class constants.
 * 
 * @version 1.0
 * @author Antti Laitinen
 * @author Väinö Ala-Härkönen
 * @author Tuomas Tanner
 * @author Reima Halmetoja
 */
public class Tools {

    /** Relative path of the dbconfig.properties file */
    public static final String PATH_DBCONFIG = "config" + File.separator + "dbconfig.properties";
    /** Relative path of the resourcegraph.sql file */
    public static final String PATH_RESOURCEGRAPH_SQL = "config" + File.separator + "resourcegraph.sql";
    /** Relative path of the rawdata.sql file */
    public static final String PATH_RAWDATA_SQL = "config" + File.separator + "rawdata.sql";
    /** Relative path of the resourcegraph.sql file (same value as {@link #PATH_RESOURCEGRAPH_SQL}) */
    public static final String RESOURCENET_PATH = "config" + File.separator + "resourcegraph.sql";
    /**
     * Person name format "Firstname M. Surname", several persons separated with a comma
     */
    public static final int ACTOR_CITESEER = 0;
    /**
     * Person name format "Surname, Firstname M.", one actor per field
     */
    public static final int ACTOR_OTHER = 1;

    /** Parameter name "location" (presumably a data source setting - usage is not visible in this class) */
    public static final String DATASOURCE_PARAM_LOCATION = "location";

    /** Parameter name "updated" (presumably a data source setting - usage is not visible in this class) */
    public static final String DATASOURCE_PARAM_UPDATED = "updated";

    /** Upper case fragments that mark a canonized actor string as an organization */
    private static final String[] ORGKEYWORDS = {" INC"," LTD"," OF "," CO.", " CORP", " PRESS", " UNIV", " PUBL"};
    /** Upper case fragments that mark a canonized actor string as a person */
    private static final String[] PERSONKEYWORDS = {" ED.", " EDS.", "ET AL.", "ET ALII.", " ANON.", " JR.", " SR."};

    /** Actor type: organization */
    private static final String TYPE_ORGANIZATION = "Organisaatio";
    /** Actor type: person */
    private static final String TYPE_PERSON = "Henkilö";
    /** Actor type: default when nothing more specific could be determined */
    private static final String TYPE_UNKNOWN = "Joku";

    /** One or more consecutive whitespace characters */
    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");
    /** Every non-word character except , . / - : ~ and space */
    private static final Pattern UNWANTED_CHARS = Pattern.compile("[\\W&&[^\\,\\.\\/\\-\\:\\~ ]]");
    /** A canonized string containing an initial such as " M." */
    private static final Pattern NAME_INITIAL = Pattern.compile(".* [A-Z]\\..*");
    /** xxxx-xx-xx where x is a digit */
    private static final Pattern ISO_DATE = Pattern.compile("\\d{4}-\\d{2}-\\d{2}");
    /** Two or three upper case letters */
    private static final Pattern LANG_CODE = Pattern.compile("[A-Z]{2,3}");
    /** Words shorter than 4 characters together with their surrounding whitespace */
    private static final Pattern SHORT_WORDS = Pattern.compile("(\\s+|\\G|^)\\S{0,3}(\\s+|$)");
    /** At least 2 commas and/or dots */
    private static final Pattern CITATION_PUNCTUATION = Pattern.compile("(.*(\\.|,)+.*){2,}");
    /** A year between 1800 and 2099 */
    private static final Pattern YEAR = Pattern.compile("(1[8-9]\\d{2})|(20\\d{2})");

    private Tools() {} // nothing to execute here

    /** 
     * Canonizes a String without any special heuristics or transformation: 
     * 1. Trim, then collapse 1-n consequent whitespaces to one space
     * 2. Remove all characters except for A-Z a-z 0-9 _ , . / - : ~ and space
     * (Note: this means characters like å, ä, ö are gone too which 
     * IS according to specs, not a bug. The underscore survives because
     * the regular expression treats it as a word character.)
     * 3. Convert to upper case
     * 
     * Because step 2 runs after the trimming, the result may still contain
     * leading, trailing or doubled spaces where characters were removed.
     * 
     * @param str The original string
     * @return The canonized version of the string - null if the original String was null
     */
    public static String canonizeGeneric(String str) {
        if (str == null) {
            return null;
        }
        String canonizedStr = WHITESPACE_RUN.matcher(str.trim()).replaceAll(" ");
        canonizedStr = UNWANTED_CHARS.matcher(canonizedStr).replaceAll("");
        // Only ASCII is left at this point; a fixed locale keeps e.g. a Turkish
        // default locale from turning "i" into a dotted capital I.
        return canonizedStr.toUpperCase(Locale.ENGLISH);
    }

    /**
     * Canonizes an Actor string according to the specifications.
     * Note that a name without an initial or a person keyword (such as
     * "John Smith") is not recognized as a person and gets the default type.
     * 
     * @param str 
     * A String containing one or more actors
     * @param dataType
     * Type of Actor data to be canonized, use ACTOR_-constants
     * @return 
     * A String array: the first String specifies type of the
     * actor(s) according to specifications ("Organisaatio"/"Henkilö"/"Joku").
     * The rest of the Strings are names of the actors.
     * Value null is returned if original String is null.
     */
    public static String[] canonizeActor(String str, int dataType) {
        if (str == null) {
            return null;
        }
        if (isWebSite(str)) { // Sites are returned as they are
            return new String[] { TYPE_UNKNOWN, str };
        }

        String canonizedStr = canonizeGeneric(str);
        if (containsAny(canonizedStr, ORGKEYWORDS)) { // Is it an organization?
            return new String[] { TYPE_ORGANIZATION, canonizedStr };
        }

        boolean person = NAME_INITIAL.matcher(canonizedStr).matches(); // Is it a name?
        if (!person) { // Could it be name anyway?
            for (int i = 0; i < PERSONKEYWORDS.length; i++) {
                if (canonizedStr.indexOf(PERSONKEYWORDS[i]) != -1) {
                    // Literal replace: the keywords contain '.', which as a regular
                    // expression would also eat unrelated text such as " EDW".
                    canonizedStr = canonizedStr.replace(PERSONKEYWORDS[i], "").trim(); // remove the little words
                    person = true;
                }
            }
        }
        if (!person) { // Everything else failed, it's the default type
            return new String[] { TYPE_UNKNOWN, canonizedStr };
        }

        if (dataType == ACTOR_CITESEER) { // "Firstname M. Surname, Firstname M. Surname, ..."
            String[] actors = canonizedStr.split(",");
            String[] actorList = new String[actors.length + 1];
            actorList[0] = TYPE_PERSON;
            for (int i = 0; i < actors.length; i++) {
                actorList[i + 1] = toSurnameAndInitial(actors[i].trim());
            }
            return actorList;
        }

        // In any other case we're assuming ACTOR_OTHER since no more actor formats at the moment
        return new String[] { TYPE_PERSON, shortenSurnameFirst(canonizedStr) };
    }

    /** Tells whether the raw actor string looks like a web address. */
    private static boolean isWebSite(String str) {
        return (str.indexOf("://") != -1) || (str.indexOf("www.") != -1);
    }

    /** Tells whether the text contains at least one of the given fragments. */
    private static boolean containsAny(String text, String[] fragments) {
        for (int i = 0; i < fragments.length; i++) {
            if (text.indexOf(fragments[i]) != -1) {
                return true;
            }
        }
        return false;
    }

    /** Turns "FIRSTNAME M. SURNAME" into "SURNAME, F"; a single word is returned unchanged. */
    private static String toSurnameAndInitial(String name) {
        int lastSpace = name.lastIndexOf(" ");
        if (lastSpace == -1) { // There's only one name, keep it like that
            return name;
        }
        return name.substring(lastSpace + 1) + ", " + name.substring(0, 1);
    }

    /** Cuts "SURNAME, FIRSTNAME M." or "SURNAME FIRSTNAME" down to "SURNAME, F". */
    private static String shortenSurnameFirst(String name) {
        int splitpoint = name.indexOf(", ");
        if (splitpoint != -1) {
            // Keep the text before the split point and one character after it.
            // The canonized string can end right after ", " (characters are removed
            // after the trimming), so the end index has to be clamped.
            return name.substring(0, Math.min(splitpoint + 3, name.length()));
        }
        splitpoint = name.indexOf(" ");
        if (splitpoint != -1) { // No comma, use the first space instead
            int end = Math.min(splitpoint + 2, name.length());
            return name.substring(0, splitpoint) + ", " + name.substring(splitpoint + 1, end);
        }
        return name; // Only one word...
    }

    /**
     * Canonizes / normalizes a Date String according to the specifications.
     * Date format used is ISO 8601, http://www.w3.org/TR/NOTE-datetime
     * but we're only saving the YYYY-MM-DD part of it.
     * Only the format is checked - the month and day values are not validated.
     * 
     * @param str A String containing a date
     * @return The first 10 characters of the trimmed String if they are in
     * format xxxx-xx-xx where x is a digit, otherwise null
     */
    public static String canonizeDate(String str) {
        if (str == null) {
            return null;
        }
        String trimmed = str.trim();
        if (trimmed.length() < 10) { // Can't be a valid ISO 8601 date...
            return null;
        }
        String dateString = trimmed.substring(0, 10);
        return ISO_DATE.matcher(dateString).matches() ? dateString : null;
    }

    /**
     * Canonizes / normalizes a Language String according to the specifications.
     * Language code format used is 2- or 3-character ISO639 without additional 
     * identifiers. Validity of the code is not checked, just the format.
     * 
     * @param str A String containing a language identifier
     * @return The canonized version of the String or null if format not valid
     */
    public static String canonizeLang(String str) {
        if (str == null) {
            return null;
        }
        String[] tokens = canonizeGeneric(str).split("-");
        // split() returns an empty array for input made of dashes only
        if (tokens.length > 0 && LANG_CODE.matcher(tokens[0]).matches()) {
            return tokens[0]; // the first part of the code is 2-3 letters, okay
        }
        return null;
    }

    /**
     * Canonizes a Title String according to the specifications: the short
     * word pattern is replaced with a space and the result is canonized
     * with {@link #canonizeGeneric(String)}.
     * 
     * @param str A String containing a title
     * @return The canonized version of the String or null if the String was null
     */
    public static String canonizeTitle(String str) {
        if (str == null) {
            return null;
        }
        String withoutShortWords = SHORT_WORDS.matcher(str).replaceAll(" "); // remove words less than 4 characters
        return canonizeGeneric(withoutShortWords);
    }

    /**
     * Categorizes an Identifier String according to the specifications
     * 
     * @param str 
     * The identifier to be categorized
     * @return 
     * The type of identifier that the heuristic assumed. The name of the 
     * type is returned in the program's common atomic statement property format - 
     * see specifications.
     * If the identifier String is null, returns null.
     */
    public static String categorizeIdentifier(String str) {
        if (str == null) {
            return null;
        }
        // Fixed locale: the prefixes below are ASCII and must not depend on the default locale
        String compareStr = str.toLowerCase(Locale.ENGLISH).trim();
        if (compareStr.startsWith("http")) {
            return "URLTunniste";
        }
        if (compareStr.startsWith("urn")) {
            return "URNTunniste";
        }
        if (compareStr.startsWith("info:oai")) {
            return "OAITunniste";
        }
        if (compareStr.startsWith("info:ofi")) {
            return "OpenURLTunniste";
        }
        if (compareStr.startsWith("info:doi")) {
            return "DOITunniste";
        }
        if (compareStr.startsWith("info")) {
            return "INFOTunniste";
        }
        if (looksLikeCitation(compareStr)) {
            return "ViittausTunniste";
        }
        // Checked for identifiers of any length, so long URIs are recognized too
        if (compareStr.indexOf("://") != -1) {
            return "URITunniste";
        }
        return "SisäinenTunniste";
    }

    /**
     * Citation heuristic: more than 16 characters, at least 2 commas and/or
     * dots and a year (1800-2099) as the first or the last four characters.
     */
    private static boolean looksLikeCitation(String compareStr) {
        if (compareStr.length() <= 16 || !CITATION_PUNCTUATION.matcher(compareStr).matches()) {
            return false;
        }
        String strStart = compareStr.substring(0, 4);
        String strEnd = compareStr.substring(compareStr.length() - 4);
        return YEAR.matcher(strStart).matches() || YEAR.matcher(strEnd).matches();
    }

    /**
     * Splits an Actor string to a table of substrings. The divider
     * character is ','. This method does not canonize the actors.
     * The string is only split when it is recognized as person data
     * in the CiteSeer format.
     * 
     * @param str 
     * A String containing one or more actors (persons split by ',').
     * @param dataType
     * Type of Actor data, use ACTOR_-constants.
     * @return 
     * A String array: the first String specifies type of the
     * actor(s) according to specifications ("Organisaatio"/"Henkilö"/"Joku").
     * The rest of the Strings are the uncanonized actors.
     * Value null is returned if original String is null.
     */
    public static String[] splitActor(String str, int dataType) {
        if (str == null) {
            return null;
        }
        String actorType = determineActorType(str);
        if (dataType == ACTOR_CITESEER && actorType.equals(TYPE_PERSON)) {
            return splitPerson(str); // CiteSeer format: several persons in one field
        }
        // In any other case we're assuming ACTOR_OTHER since no more actor formats at the moment
        return new String[] { actorType, str };
    }

    /**
     * Determines what type of Actor a String specifies.
     * 
     * @param str 
     * A String containing one or more actors (persons split by ','), not null.
     * @return 
     * A String which specifies type of the actor(s) according to
     * the specifications ("Organisaatio"/"Henkilö"/"Joku").
     */
    private static String determineActorType(String str) {
        if (isWebSite(str)) {
            return TYPE_UNKNOWN;
        }
        // We have to canonize because comparing the keywords won't work otherwise
        String canonizedStr = canonizeGeneric(str);
        if (containsAny(canonizedStr, ORGKEYWORDS)) {
            return TYPE_ORGANIZATION;
        }
        // An initial or a person keyword is required, so names such as
        // "John Smith" are not recognized as persons.
        if (NAME_INITIAL.matcher(canonizedStr).matches() || containsAny(canonizedStr, PERSONKEYWORDS)) {
            return TYPE_PERSON;
        }
        // Couldn't determine the actor type as anything else than the default
        return TYPE_UNKNOWN;
    }

    /**
     * Splits a String of "Henkilö"-type Actors to a table of
     * substrings. Utilized only when the dataType is known
     * to be "ACTOR_CITESEER".
     * 
     * @param str 
     * A String containing one or more actors (split by ',').
     * @return 
     * A String array: the first String specifies type of the
     * actor(s) ("Henkilö").
     * The rest of the Strings are the uncanonized (and untrimmed) actors.
     * Value null is returned if original String is null.
     */
    public static String[] splitPerson(String str) {
        if (str == null) {
            return null;
        }
        String[] actors = str.split(",");
        String[] actorList = new String[actors.length + 1];
        actorList[0] = TYPE_PERSON;
        System.arraycopy(actors, 0, actorList, 1, actors.length); // the rest are the recognized actors
        return actorList;
    }

    /** 
     * Reads a text file created in the UTF-8 charset into a string.
     * 
     * @param fileName the name of the file to read
     * @return the contents of the file, or null if the name does not
     * denote a regular file or the file could not be read
     */
    public static String readFile(String fileName) {
        Reader reader = null;
        try {
            File file = new File(fileName);
            if (!file.isFile()) {
                return null;
            }
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));

            // Read until end of stream: the byte length of the file is not the
            // number of decoded characters and one read() may return less than asked.
            StringBuilder contents = new StringBuilder();
            char[] buffer = new char[4096];
            int count;
            while ((count = reader.read(buffer)) != -1) {
                contents.append(buffer, 0, count);
            }
            return contents.toString();
        }
        catch (Exception e) { // best effort by contract: any failure means "no contents"
            return null;
        }
        finally {
            closeQuietly(reader);
        }
    }

    /** Saves a text file in the UTF-8 character encoding
     *  @param fileName the name of the file with full path information
     *  @param contents what to write
     *  @param append append to the end of an existing file or write over it
     *  @return true if successful, false if an exception occurred
     */ 
    public static boolean saveFile(String fileName, String contents, boolean append) {
        Writer writer = null;
        try {
            writer = new BufferedWriter(new OutputStreamWriter(
                     new FileOutputStream(fileName, append), "UTF-8"));
            writer.write(contents);
            writer.close(); // closed here so that a failing flush is reported as false
            writer = null;
            return true;
        }
        catch (Exception e) { // best effort by contract: any failure is reported as false
            return false;
        }
        finally {
            closeQuietly(writer);
        }
    }

    /**
     * Loads a properties file with {@link Properties#load(InputStream)}.
     * An error message is printed to standard output if loading fails.
     * 
     * @param filename the name of the properties file
     * @return the loaded properties, or null if the file could not be read
     */
    public static Properties loadProperties(String filename) {
        InputStream in = null;
        try {
            in = new FileInputStream(new File(filename));
            Properties loaded = new Properties();
            loaded.load(in);
            return loaded;
        }
        catch (Exception e) {
            System.out.println("ERROR in Tools.loadProperties: " + e);
            return null;
        }
        finally {
            closeQuietly(in);
        }
    }

    /** Closes the resource if it is open; a failure to close is ignored. */
    private static void closeQuietly(Closeable resource) {
        if (resource != null) {
            try {
                resource.close();
            }
            catch (IOException ignored) {
                // nothing sensible to do: the result of the operation is already decided
            }
        }
    }

}
