function yy = buildOligoArray(fastafile, varargin)
% updated 201002: reject oligos with ambiguous bases N.
% DESCRIPTION
%      function buildOligoArray is written to design specific oligonucleotide
%      probes targeting nucleic acid sequences (RNA, or genomic sequence, etc.).
%      Each target sequence is scanned with a sliding window of ProbeLength
%      nt; a window that fails any criterion is shifted by 1 nt, a window
%      that passes is kept and the scan jumps ahead by (ProbeGap - 1) nt.
% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
% Inputs:
%   fastafile: the fasta file of targeted nucleic acid sequences.
%
%   varargin: name/value pairs (names are matched exactly, case-sensitive;
%   unrecognised entries are ignored)
%   'ProbeLength' - length of probe (nt). Positive integer. Default is 30.
%
%   'MinTm' - minimum Tm of probe (Celsius). Default is 66.
%
%   'MaxTm' - maximum Tm of probe (Celsius). Default is 100.
%
%   'SecondaryStructureTm' - maximum secondary structure Tm of probe (Celsius).
%   Default is 76.
%
%   'CrossHybTm' - maximum cross-hybridization Tm between probes (Celsius).
%   Default is 72.
%
%   'MinGC' - minimum GC percentage. Default is 30.
%
%   'MaxGC' - maximum GC percentage. Default is 90.
%
%   'ExcludeSeq' - sequences to exclude. Put between "" and use | to separate
%   multiple sequences. Default is "GGGGGG|CCCCCC|TTTTTT|AAAAAA".
%
%   'ProbeGap' - number of nucleotides separated between the start positions
%   of adjacent probes, plus one (the scan advances ProbeGap - 1 nt after an
%   accepted probe). For example, when probe length is 30, 31 means no
%   overlap and no gap between adjacent probes; 11 means 20nt overlap between
%   adjacent probes. Must be an integer >= 2. Default is 31.
% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
% Output:
%   The function generates a oligos.fasta file (in the current folder) of
%   all qualified oligo probe sequences that pass through all criteria, and
%   returns a structure array with fields Sequence, FragID and Header that
%   records the headers and sequences of all good oligos. When no oligo
%   qualifies, the returned structure array is empty and no file is written.
% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
% Errors:
%   buildOligoArray:missingValue       - an option name is the last argument.
%   buildOligoArray:invalidProbeLength - ProbeLength is not a positive integer.
%   buildOligoArray:invalidProbeGap    - ProbeGap is not an integer >= 2.

% initialize default parameters
Plength = 30; % length of probe
t = 66; % minimum Tm of probe
T = 100; % maximum Tm of probe
s = 76; % maximum secondary structure Tm of probe
x = 72; % maximum cross-hybridization Tm
p = 30; % minimum GC percentage
P = 90; % maximum GC percentage
m = "GGGGGG|CCCCCC|TTTTTT|AAAAAA"; % sequences to exclude
g = 31; % gap between adjacent probes

% get input parameters
% A recognised name consumes the argument that follows it as its value, so
% a value that happens to equal an option name is never re-read as a name.
knownNames = {'ProbeLength', 'MinTm', 'MaxTm', 'SecondaryStructureTm', ...
    'CrossHybTm', 'MinGC', 'MaxGC', 'ExcludeSeq', 'ProbeGap'};
nArgs = numel(varargin);
argIdx = 1;
while argIdx <= nArgs
    name = varargin{argIdx};
    isOptionName = (ischar(name) || (isstring(name) && isscalar(name))) ...
        && any(strcmp(name, knownNames));
    if ~isOptionName
        % not an option name: skip it
        argIdx = argIdx + 1;
        continue
    end
    if argIdx == nArgs
        error('buildOligoArray:missingValue', ...
            'Option ''%s'' must be followed by a value.', char(name));
    end
    value = varargin{argIdx + 1};
    switch char(name)
        case 'ProbeLength'
            Plength = value;
        case 'MinTm'
            t = value;
        case 'MaxTm'
            T = value;
        case 'SecondaryStructureTm'
            s = value;
        case 'CrossHybTm'
            x = value;
        case 'MinGC'
            p = value;
        case 'MaxGC'
            P = value;
        case 'ExcludeSeq'
            m = value;
        case 'ProbeGap'
            g = value;
    end
    argIdx = argIdx + 2;
end

% validate the two parameters that drive the sliding window before doing
% any work: a non-positive length yields empty windows, and a gap below 2
% makes the step after an accepted oligo (g - 1) zero or negative, so the
% scan would never terminate.
if ~(isnumeric(Plength) && isscalar(Plength) && isfinite(Plength) ...
        && Plength >= 1 && Plength == fix(Plength))
    error('buildOligoArray:invalidProbeLength', ...
        '''ProbeLength'' must be a positive integer.');
end
if ~(isnumeric(g) && isscalar(g) && isfinite(g) ...
        && g >= 2 && g == fix(g))
    error('buildOligoArray:invalidProbeGap', ...
        '''ProbeGap'' must be an integer greater than or equal to 2.');
end

% silence warnings only for the duration of this call; the previous warning
% state is restored when the function exits, normally or through an error.
previousWarningState = warning('off', 'all');
restoreWarnings = onCleanup(@() warning(previousWarningState)); %#ok<NASGU>

% get the headers and sequences information from the input fasta file.
[Head, Seq] = fastaread(fastafile);
if ~iscell(Seq)
    % a single-record file is not returned as cells; wrap it so the loop
    % below can treat both cases uniformly.
    Seq = {Seq};
    Head = {Head};
end

% typed empty struct array so the result is a structure even when no oligo
% qualifies; field order matches the order in which fields are filled below.
oligos = struct('Sequence', {}, 'FragID', {}, 'Header', {});

% loop through each gene/sequence and get the oligos
for k = 1:length(Seq)
    target = Seq{k};
    targetLength = length(target);
    GoodOligo = {}; % all good oligos of this gene, in order of discovery
    OligoStart = 1; % start position of oligo

    while OligoStart + Plength - 1 <= targetLength
        Oligo = target(OligoStart:OligoStart + Plength - 1);

        if buildOligoArray_isGoodOligo(Oligo, GoodOligo, t, T, s, x, p, P, m)
            % accepted: record it and move g - 1 nucleotides forward.
            GoodOligo{end+1} = Oligo; %#ok<AGROW>
            OligoStart = OligoStart + g - 1;
        else
            % rejected: the next window starts one nucleotide forward.
            OligoStart = OligoStart + 1;
        end
    end

    fprintf('%d oligos initially generated for %s.\n', numel(GoodOligo), Head{k});

    % append the good oligos of this gene to the output structure. Header
    % is prefixed with the running index over all genes, FragID carries the
    % per-gene index.
    for j = 1:numel(GoodOligo)
        oligos(end+1).Sequence = GoodOligo{j}; %#ok<AGROW>
        oligos(end).FragID = [Head{k} '_Seq_' num2str(j)];
        oligos(end).Header = [num2str(numel(oligos)) '_' oligos(end).FragID];
    end
end
yy = oligos;

% delete any existing oligos.fasta file in the current folder, so the file
% holds only the oligos of this run, then write the good oligos into it.
% An absolute path keeps exist() from matching a same-named file elsewhere
% on the MATLAB search path.
outputFile = fullfile(pwd, 'oligos.fasta');
if exist(outputFile, 'file') == 2
    delete(outputFile);
end
if ~isempty(oligos)
    fastawrite(outputFile, oligos);
end


function tf = buildOligoArray_isGoodOligo(Oligo, GoodOligo, t, T, s, x, p, P, m)
% Private helper: true when Oligo passes every design criterion.
%   Oligo     - candidate probe sequence (char row).
%   GoodOligo - cell array of probes already accepted for the same target.
%   t, T      - minimum / maximum probe Tm.
%   s         - maximum secondary structure Tm.
%   x         - maximum cross-hybridization Tm against accepted probes.
%   p, P      - minimum / maximum GC percentage.
%   m         - regular expression of sequences to exclude.
% The checks are independent rejections, so they are ordered cheapest first.
tf = false;

% check nucleotide repeats / excluded motifs (case-insensitive).
if ~isempty(regexpi(Oligo, m, 'once'))
    return
end

% reject oligos containing ambiguous bases N.
if ~isempty(regexpi(Oligo, 'n', 'once'))
    return
end

% check oligo Tm:
% 'Salt' - salt concentration in 1 moles/liter for Tm calculations
% 'Primerconc' - specifies the concentration of each oligo in moles/liter
SeqProperties = oligoprop(Oligo, 'Salt', 1, 'Primerconc', 1e-6);

% use the 5th Tm calculated with Nearest-neighbor method (SantaLucia Jr., 1998)
Tm = SeqProperties.Tm(5);
if Tm > T || Tm < t
    return
end

% check oligo GC percentage.
if SeqProperties.GC > P || SeqProperties.GC < p
    return
end

% check secondary structure Tm:
% The '(' in the bracket notation indicates the pairing sequences of stems.
% Concatenate all stem sequences and calculate the Tm.
RNAbracket = rnafold(Oligo);
SSeq = Oligo(strfind(RNAbracket, '('));
if ~isempty(SSeq)
    % NOTE(review): this call uses oligoprop's default salt / primer
    % concentration, not the overrides used for the probe Tm above --
    % confirm that is intended.
    SSeqProp = oligoprop(SSeq);
    if SSeqProp.Tm(5) > s
        return
    end
end

% check cross-hybridization Tm against every previously accepted oligo:
% align each good oligo with the reverse-complement of the candidate, take
% the positions marked '|' (identical bases) in the alignment, and use the
% Tm of those concatenated bases as the cross-hyb Tm.
% NOTE(review): swalign is called with its default alphabet and scoring
% matrix -- confirm these defaults are appropriate for nucleotide input.
OligoRC = seqrcomplement(Oligo);
for j = 1:numel(GoodOligo)
    [~, Alignment] = swalign(GoodOligo{j}, OligoRC);
    AliSeqid = strfind(Alignment(2,:), '|');
    if ~isempty(AliSeqid)
        CrossProps = oligoprop(Alignment(1, AliSeqid));
        % a single pair above the limit is enough to reject the candidate.
        if CrossProps.Tm(5) > x
            return
        end
    end
end

tf = true;

% ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
% This program is developed by Bing Yang. Last updated on 01/07/2021. All 
% code contained here is licensed under the creative commons CC BY NC.
              







