% This file contains a MATLAB implementation of the sparse dynamic programming algorithm for computing gap-weighted substring kernels, described in the article: % Juho Rousu & John Shawe-Taylor: Efficient Computation of Gapped Substring Kernels on Large Alphabets. Journal of Machine Learning Research 6 (2005), 1323-1344 % % DISCLAIMER: This code is provided AS IS with no warranty of any kind. % The code has been optimizaed for efficiency and does not contain error checks of any kind. For documentation please refer to the above article or to the comments in the code. % Although the code is not actively maintained, questions and bug reports may be sent to the first author. % % (c) 2003-2005 Juho Rousu (firstname dot lastname at cs dot helsinki dot fi) function [Kappa] = GapWeightedSubsequences_Sparse(s,t,params) % strings s and t are given as integer (column) vectors, s(i),t(i) in [1:sizeofalphabet] % the structure params contains additional parameters: % params.seqlen : subsequence length limit % params.alphaSize : size of alphabet % params.lambda : gap penalty, 0 < lambda <= 1, setting lambda = 1 means no % gap penalty, lower values penalize more heavily global V_STRIP_WIDTH; global H_STRIP_HEIGHT; m = length(s); n = length(t); if nargin < 3 p = min(m,n); alphaSize = max(max(s),max(t)); lambda = 1; else p = params.seqlen; alphaSize = params.alphaSize; lambda = params.lambda; end % the range of the character indices i = 1:m, j = 1:n is divided into % strips to avoid numeric underflow when multiplying by powers of lambda % the smaller |lambda-1| is the larger the strips can be if (lambda < 1) V_STRIP_WIDTH = 2^floor(log2(min(-150/log10(lambda),n))); H_STRIP_HEIGHT = 2^floor(log2(min(-150/log10(lambda),m))); else V_STRIP_WIDTH = n; H_STRIP_HEIGHT = m; end % Match sets are stroed in lists, one for each s(i), stored as matrix form % as follows: kappa(i,j) is stored in MatchSet{i} as [j,kappa(i,j)]'. MatchList = createMatchListFrom(s,t,lambda,alphaSize); % match list of length l= 1 subsequences NewMatchList = cell(m,1); % match list for length l+1 subsequences, to be filled % A binary search tree Weight (implicitly stored in a matrix) is used as the data % structure to compute sums % of kernel values of proper prefixes. From the tree answers to one-dimensional % range queries of the type ]-inf,a] are found in log(n) time. In each node % the sum of weight of the key at the node and those in the left subtree is % stored. Weight = zeros(ceil(n/V_STRIP_WIDTH),V_STRIP_WIDTH); % When updating the tree, only nodes whose left child belongs to the path % need to be updated. When querying it suffices to sum over those nodes % whose right child in on the search path. In addition the node containing the query % key is always updated and its weight added to query.. [QueryPath, UpdatePath] = precompute_rangepaths_opt(min(n,V_STRIP_WIDTH+1)); % the main algorithm computes kernel values incrementaly in incresing order of % subsequence length, by traversing the match lists MatchLists{i} in increasing order of i % and making queries to the Weight-tree. for l = 2:p % starting from the first horizontal strip h_strip_number = 1; h_strip_max = H_STRIP_HEIGHT; firstrow = 1; % the first non-empty row to be seen still % process match lists for i = 1:m L = MatchList{i}; len = size(L,2); if len > 0 if ~firstrow NewL = zeros(size(L)); nindex = 0; % check if we crossed horizontal strip boundary if i > h_strip_max % how many strips? strips_passed = ceil((i-h_strip_max)/H_STRIP_HEIGHT); h_strip_number = h_strip_number + strips_passed; h_strip_max = h_strip_max + strips_passed*H_STRIP_HEIGHT; % rescale the Weights to be represented as offsets to % (h_strip_max,v_strip_max) Weight = Weight*lambda^(strips_passed*H_STRIP_HEIGHT); end % scan the match list, keeping track of which vertical strip we % are in % start from the first vertical strip v_strip_number = 1; v_strip_max = V_STRIP_WIDTH; v_strip_sum = 0; for j = 1:len % query the sum of weights (gap weighted match counts) of the proper prefixes % of s(1:i) and t(1:L(1,j)) node = L(1,j); % did we cross over a strip boundary? while node > v_strip_max % update the sum of weight of all strips crossed so far v_strip_sum = lambda^V_STRIP_WIDTH*(v_strip_sum + Weight(v_strip_number,V_STRIP_WIDTH)); v_strip_number = v_strip_number + 1; v_strip_max = v_strip_max + V_STRIP_WIDTH; end sumOfWeights = v_strip_sum; qnode = node - v_strip_max + V_STRIP_WIDTH; % offset from the strip boundary; q = 1; iq = QueryPath(q,qnode); while iq > 0 sumOfWeights = sumOfWeights + Weight(v_strip_number,iq); q = q+1; iq = QueryPath(q, qnode); end % store only non-zero values thus taking advantage of the % sparsity. if sumOfWeights > 0 nindex = nindex + 1; % NewL(1:2,nindex) = [node;sumOfWeights]; NewL(1,nindex) = node; NewL(2,nindex) = sumOfWeights; end end if nindex > 0 NewMatchList{i} = NewL(1:2,NewL(1,:) > 0); % this to eliminate anomalic 0x2 matrices else NewMatchList{i} = []; end end % if ~firstrow % update the weight table with matches in L v_strip_number = 1; v_strip_max = V_STRIP_WIDTH; for j = 1:size(L,2) node = L(1,j); while node > v_strip_max v_strip_number = v_strip_number + 1; v_strip_max = v_strip_max + V_STRIP_WIDTH; end w_node = node - v_strip_max + V_STRIP_WIDTH; w = L(2,j); u = 1; iw = UpdatePath(u,w_node); while iw > 0 Weight(v_strip_number,iw) = Weight(v_strip_number,iw) + w; u = u+1; iw = UpdatePath(u, w_node); end end if firstrow firstrow = 0; % we have seen an processed the firstrow NewMatchList{i} = []; end end end Weight(:,:) = 0; MatchList = NewMatchList; end % compute the kappa values for the final level Kappa = 0; for i = 1:m h_strip_max = ceil(i/H_STRIP_HEIGHT)*H_STRIP_HEIGHT; L = MatchList{i}; for j = 1:size(L,2) v_strip_max = ceil(L(1,j)/V_STRIP_WIDTH)*V_STRIP_WIDTH; % rescaling the kappa values by lambda^(i+j) to counteract the % scaling done in preprocessing % Kappa = Kappa + L(2,j)*exp((i-h_strip_max+L(1,j)-v_strip_max)*log(lambda))'; Kappa = Kappa + L(2,j)*lambda^(i-h_strip_max+L(1,j)-v_strip_max); end end % normalising to ensure = 1 Kappa = Kappa/lambda^(2*p); function L = createMatchListFrom(s,t,lambda,alphaSize) global H_STRIP_HEIGHT; global V_STRIP_WIDTH; Index = cell(alphaSize,1); for c = 1:alphaSize Index{c} = find(t == c); if size(Index{c},1) == 0 Index{c} = []; end end % store the match weights as offsets from nearest strip lower righthand corner L = cell(length(s),1); for i = 1:length(s) h_strip_max = ceil(i/H_STRIP_HEIGHT)*H_STRIP_HEIGHT; I = Index{s(i)}; TmpList = zeros(2,size(I,2)); for j = 1:size(TmpList,2) v_strip_max = ceil(I(j)/V_STRIP_WIDTH)*V_STRIP_WIDTH; TmpList(1:2,j) = [I(j);exp((2+h_strip_max-i+v_strip_max-I(j))*log(lambda))]; end L{i} = TmpList; end function [Query,Update] = precompute_rangepaths_opt(n) % computes paths to the range sum tree for updating and querying, % enables the updates and queries to be made in a simple for-loop % rather than explicitly traversing the tree highbit = ceil(log2(n+1)); powersof2 = [exp((highbit-1:-1:0)*log(2))]; if highbit == 0 Query = []; Update = []; return; else Query = zeros(highbit+1,n+1); Update = zeros(highbit+1,n+1); end rootindex = 2^(highbit); for j = 1:(n-1) % compute the indices along the path from root to the node % corresponding to j bits = bitget(j,highbit:-1:1); path = round((2*[0,bits(1:end-1)]-1).*powersof2); path(1) = path(1) + rootindex; for i = 2:highbit path(i) = path(i) + path(i-1); end % search for the least significant bit lsb = highbit; while bits(lsb) == 0 lsb = lsb-1; if lsb == 1 break; end end % now, compute the partial paths for queries and updates % Note that we actually want to exclude the query key from the range, % hence the indexing j+1 qp = path(bits(1:lsb) == 1); Query(1:length(qp),j+1) = qp'; up = [path(bits(1:lsb-1) == 0),path(lsb)]'; up = up(up <= n); Update(1:length(up),j) = up; end