# PrimedRPA

#! /usr/bin/env python3

#####################################################################
#  PrimedRPA: RPA Primer and Probe Set Finder                       #
#  Higgins M et al. Submitted. 2018                                 #
#                                                                   #
#  Dependencies:                                                    #
#     Python 3.7
#     Glob 0.6
#     Pandas 0.20.3                                                 #
#     Sys 3.6.3                                                     #
#     Bio 1.70                                                      #
#####################################################################


# Import necessary Python libraries
import os
import sys
import glob
import subprocess
import itertools
import random
import pandas as pd
from collections import Counter
from multiprocessing import Pool
import argparse
import re


def FastaToDict(InputFile):
	"""Parse a FASTA file into a ``{header: sequence}`` dictionary.

	A line starting with ``>`` opens a record; the key is the line without
	the leading ``>``.  Every other non-blank line is upper-cased, has ``U``
	replaced by ``T`` and is appended to the record opened most recently.
	A header that appears more than once keeps extending the same entry.
	"""
	sequences = {}
	with open(InputFile) as handle:
		for raw_line in handle:
			text = raw_line.strip()
			if text == '':
				continue
			if text[0] == '>':
				active_sequence_name = text[1:]
				sequences.setdefault(active_sequence_name, '')
			else:
				# Uracil is substituted for T so only the 4 DNA bases remain
				sequences[active_sequence_name] += text.upper().replace('U', 'T')
	return sequences


# Wrapper for clustal omega
def RunningClustalo1(ClustalOPath,
					ListOfFileNames,
					overwriteOutput=True):
	"""Align each FASTA file in ``ListOfFileNames`` with Clustal Omega.

	The alignment of ``name.fasta`` is written to ``name_Aligned.fasta`` in
	FASTA format.

	Parameters:
		ClustalOPath: the Clustal Omega executable (name or path).
		ListOfFileNames: iterable of input FASTA paths.
		overwriteOutput: when true, ``--force`` is passed so an existing
			output file is replaced.

	Failures are reported on stdout and do not raise, so the remaining files
	are still attempted.
	"""

	print('Aligning Input File')
	for FileName in ListOfFileNames:
		OutputName = FileName.replace(".fasta",'_Aligned.fasta')

		# An argument list (no shell) keeps paths containing spaces intact
		command = [ClustalOPath, '-i', FileName, '-o', OutputName, '--outfmt=fasta']
		if overwriteOutput:
			command.append('--force')

		try:
			# stdout is discarded rather than piped: a pipe nobody reads can
			# fill up and block the child process
			returncode = subprocess.call(command, stdout=subprocess.DEVNULL)
		except OSError as error:
			print('Clustal Omega could not be started ({}): {}'.format(ClustalOPath, error))
			continue

		if returncode != 0:
			print('Clustal Omega exited with status {} while aligning {}'.format(returncode, FileName))


# Function to generate reverse complement sequence
def getComplement(seq,
				 reverse=False,
				 rule='N2N'):
	"""Return the complement of ``seq``, optionally reversed.

	The input is upper-cased first.  A/T and C/G are swapped and U becomes A.
	'N' becomes '-' under rule 'N2-', stays 'N' under rule 'N2N' and is
	dropped for any other rule.  Gaps ('-') and every other character are
	written as 'N'.

	Parameters:
		seq: nucleotide string.
		reverse: when true the complemented string is returned reversed.
		rule: 'N2N' or 'N2-', selecting how 'N' is rendered.
	"""
	if rule == 'N2-':
		n_symbol = '-'
	elif rule == 'N2N':
		n_symbol = 'N'
	else:
		n_symbol = ''

	partner = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'U': 'A', 'N': n_symbol}

	# Anything without an entry (including '-') falls back to 'N'
	seqComp = ''.join(partner.get(base, 'N') for base in seq.upper())

	return seqComp[::-1] if reverse else seqComp


## Background Binding Check = TO IMPROVE
def BlastnBackgroundCheck(seq,AllParameter):
	"""Estimate the worst-case cross reactivity of ``seq`` against the background database.

	A temporary single-record FASTA file is searched with blastn against
	``AllParameter.BlastnDBName``.  For every hit, the region of the
	background FASTA (``AllParameter.BackgroundCheck``) that would span the
	whole oligo is extracted with ``samtools faidx``, complemented and scored
	with ``SSIdentification``.  Hits whose extracted region is not exactly as
	long as ``seq`` are left unscored.  When at least one hit was scored the
	table is written to ``<PrimerBlastnOutput>/<seq>_Blastn_Output.csv``.

	Parameters:
		seq: oligo sequence; it is also embedded in the temporary file names.
		AllParameter: run settings; reads PrimerBlastnOutput,
			BackgroundSearchSensitivity, BlastnPath, BlastnDBName, Evalue,
			SamtoolsPath and BackgroundCheck.

	Returns:
		Tuple ``(score, background_id, hard_fail)``: the highest advanced
		cross-reactivity percentage (0 when nothing was scored), the
		background sequence ID it belongs to ('' when the score is 0) and
		whether any scored hit triggered the hard-fail rule.

	Raises:
		subprocess.CalledProcessError: when samtools exits with a non-zero status.
	"""

	MaxBackgroundScoreBindingScore = 0
	MaxScoreBackSeq = ''
	HardFailBoolean = False
	MaximumPercentageMatch = 0
	MaxHomologyBackgroundSeq = ''

	# Random suffix appended to every temporary file name of this call
	RIS = str(random.randint(0,1000000))

	BlastnInputPath = './{}/{}_{}_Blastn_Input.fa'.format(AllParameter.PrimerBlastnOutput,seq,RIS)
	BlastnRawOutputPath = '{}/{}_{}_Blastn_Output.csv'.format(AllParameter.PrimerBlastnOutput,seq,RIS)

	# Create temp fasta file
	with open(BlastnInputPath,'w') as tempfastainput:
		tempfastainput.write('>Temp_Blastn_Fasta\n{}\n'.format(seq))

	# Parameters default for basic run
	Penalty = '-2'
	WordSize = '4'
	if AllParameter.BackgroundSearchSensitivity == 'Advanced':
		Penalty = '-1'
	elif AllParameter.BackgroundSearchSensitivity == 'Fast':
		Penalty = '-3'
		WordSize = '7'

	# Trigger Blastn command
	blastncommandrun = '{0} -word_size {5} -gapopen 5 -gapextend 2 -reward 1 -penalty {4} -evalue {7}  -query  {3}/{1}_{6}_Blastn_Input.fa -db {2}/{2} -out {3}/{1}_{6}_Blastn_Output.csv -outfmt "10 sseqid pident qstart qend sstart send evalue gaps sstrand" '.format(
		AllParameter.BlastnPath,
		seq,
		AllParameter.BlastnDBName,
		AllParameter.PrimerBlastnOutput,
		Penalty,
		WordSize,
		RIS,
		AllParameter.Evalue)

	subprocess.call([blastncommandrun],shell=True)

	# Remove Blastn Input File
	os.remove(BlastnInputPath)

	# Try to read dataframe (may be empty if no alignments found)
	try:

		ShorterBackground = False

		blastnoutdf = pd.read_csv(BlastnRawOutputPath,header=None)
		blastnoutdf.columns=['Background_SeqID','Percentage Identity','qStart','qEnd','BackSeq_Start','BackSeq_End','Evalue','Number Of Gaps','Strand']
		blastnoutdf['Cross Reactivity Score'] = ((((blastnoutdf['qEnd']+1) - blastnoutdf['qStart'])*(blastnoutdf['Percentage Identity']/100))/len(seq))*100
		blastnoutdf = blastnoutdf.sort_values(by=['Cross Reactivity Score'],ascending=False)

		for blastoutindex in blastnoutdf.index.tolist():

			# IDs containing exactly two '|' keep only the text between the
			# first '|' and the final character
			ReferenceID = blastnoutdf.loc[blastoutindex,'Background_SeqID']
			if ReferenceID.count("|") == 2:
				CleanRefID = ReferenceID[ReferenceID.find("|")+1:-1]
			else:
				CleanRefID = ReferenceID

			# If Background Seq Is (+) Sense
			if blastnoutdf.loc[blastoutindex,'Strand']=='plus':

				UpperExtension = (len(seq)-blastnoutdf.loc[blastoutindex,'qEnd']) + blastnoutdf.loc[blastoutindex,'BackSeq_End']
				LowerExtension = blastnoutdf.loc[blastoutindex,'BackSeq_Start'] - blastnoutdf.loc[blastoutindex,'qStart']+1

				if LowerExtension<0:
					LowerExtension = 0
					ShorterBackground = True

				SamToolsCommand = '{} faidx {} {}:{}-{} -o Adv_{}_{}_{}.fa'.format(AllParameter.SamtoolsPath,
																				   AllParameter.BackgroundCheck,
																				   CleanRefID,
																				   LowerExtension,
																				   UpperExtension,
																				   seq,
																				   CleanRefID,
																				   RIS)

			# If Background Seq Is (-) Sense
			else:
				UpperExtension = blastnoutdf.loc[blastoutindex,'BackSeq_Start'] +  blastnoutdf.loc[blastoutindex,'qStart']-1
				LowerExtension = blastnoutdf.loc[blastoutindex,'BackSeq_End'] - (len(seq)-blastnoutdf.loc[blastoutindex,'qEnd'])

				if LowerExtension<0:
					LowerExtension = 0
					ShorterBackground = True

				SamToolsCommand = '{} faidx -i {} {}:{}-{} -o Adv_{}_{}_{}.fa'.format(AllParameter.SamtoolsPath,
																					  AllParameter.BackgroundCheck,
																					  CleanRefID,
																					  LowerExtension,
																					  UpperExtension,
																					  seq,
																					  CleanRefID,
																					  RIS)

			# Run Samtools Command
			try:
				subprocess.run(SamToolsCommand, shell=True, check=True, capture_output=True)
			except subprocess.CalledProcessError as cpe:
				print("stderr:", cpe.stderr.decode())
				raise

			# Obtain Sequence
			ExtractedFastaPath = 'Adv_{}_{}_{}.fa'.format(seq,CleanRefID,RIS)
			fastadict = FastaToDict(ExtractedFastaPath)

			# Extract Sequence & Complement
			ExtraSeq = list(fastadict.values())[0]
			ComplementSeq = getComplement(ExtraSeq)

			# Bug Fix 2020-07-06 - I.e. Only run if global alignment is complete
			if len(ComplementSeq) == len(seq):

				# Run SS Check Function
				BackgroundMaxBindingPercentage, BackgroundMaxBindingString, BackgroundHardFail, BackgroundString = SSIdentification(seq, ComplementSeq, False, ShorterBackground)

				# Add to Blastn DataFrame
				blastnoutdf.loc[blastoutindex,'Advanced Cross Reactivity Percentage'] = BackgroundMaxBindingPercentage
				blastnoutdf.loc[blastoutindex,'Advanced Cross Reactivity Percentage String'] = BackgroundMaxBindingString
				blastnoutdf.loc[blastoutindex,'Cross Reactivity Hard Fail'] = BackgroundHardFail
				blastnoutdf.loc[blastoutindex,'Cross Reactivity Hard Fail String'] = BackgroundString

			# Remove Advance Samtools Generated String
			os.remove(ExtractedFastaPath)

		# Remove Raw Blastn Output
		os.remove(BlastnRawOutputPath)

		# The advanced columns only exist when at least one hit was scored;
		# without them there is nothing to rank and the score stays at 0.
		if 'Advanced Cross Reactivity Percentage' in blastnoutdf.columns:

			# Save Clean Blast Output
			blastnoutdf = blastnoutdf.sort_values(by=['Advanced Cross Reactivity Percentage'],ascending=False)
			blastnoutdf.to_csv('{}/{}_Blastn_Output.csv'.format(AllParameter.PrimerBlastnOutput,seq),index=None)

			# IndexOfInterest is an index *label* (the frame has been
			# re-sorted), so both look-ups must be label based.
			IndexOfInterest = blastnoutdf.index.tolist()[0]
			MaximumPercentageMatch = blastnoutdf.loc[IndexOfInterest,'Advanced Cross Reactivity Percentage']
			MaxHomologyBackgroundSeq = blastnoutdf.loc[IndexOfInterest,'Background_SeqID']

			if True in blastnoutdf['Cross Reactivity Hard Fail'].unique():
				HardFailBoolean = True

	# If dataframe empty e.g. no alignments at all
	except pd.errors.EmptyDataError:
		MaximumPercentageMatch = 0
		# The empty raw output is of no further use
		if os.path.exists(BlastnRawOutputPath):
			os.remove(BlastnRawOutputPath)

	if MaximumPercentageMatch > MaxBackgroundScoreBindingScore:
		MaxBackgroundScoreBindingScore = MaximumPercentageMatch
		MaxScoreBackSeq = MaxHomologyBackgroundSeq

	return (MaxBackgroundScoreBindingScore, MaxScoreBackSeq, HardFailBoolean)


# Creates Exo Fluorescent Probe
def RunFluroProbeAnalysis(ProbeBindingSeq):
	"""Report whether the probe sequence has a suitably placed pair of T bases.

	Returns True when some base at 1-based position ``p`` is a 'T', with
	``int(0.45 * length) < p < int(0.75 * length)``, and the base three or
	four places further along is also a 'T'.  Only bases with at least four
	more bases after them are considered.  Returns False otherwise.
	"""
	seq_len = len(ProbeBindingSeq)
	lower_bound = int(seq_len*0.45)
	upper_bound = int(seq_len*0.75)

	# Stopping four short of the end keeps the look-ahead below in range
	for offset in range(seq_len - 4):
		position = offset + 1
		if not (lower_bound < position < upper_bound):
			continue
		if ProbeBindingSeq[offset] != "T":
			continue
		if "T" in (ProbeBindingSeq[position+2], ProbeBindingSeq[position+3]):
			return True

	return False


# Secondary Structure Filter
def SSIdentification(SeqOne, SeqTwo, ReverseOrientation, FixedBack=False ,threshold=4):
	"""Slide one sequence along another and score Watson-Crick pairing.

	Both sequences are space-padded to ``len(SeqOne) + len(SeqTwo) - 1``.
	For each arrangement one sequence is shifted right one position at a
	time while the other stays put; a column counts as bound ('|') when the
	shifted base is A/T/C/G and the fixed base is its complement.

	Parameters:
		SeqOne, SeqTwo: nucleotide strings.
		ReverseOrientation: when true, the reversed ``SeqTwo`` is tested
			against ``SeqOne`` as well.
		FixedBack: when False the percentage is relative to the shorter
			sequence, otherwise to ``len(SeqOne)``.
		threshold: accepted for compatibility; not used.

	Returns:
		Tuple ``(max_percentage, max_string, hard_fail, hard_fail_string)``.
		``max_string`` is the three-line picture (shifted sequence, binding
		marks, fixed sequence) of the first alignment with the most bound
		columns.  ``hard_fail`` is True when, for any alignment, the weighted
		score of the 22-column window at either end of ``SeqOne`` reaches
		21.5; ``hard_fail_string`` is the picture of the last such alignment.
	"""
	pairing = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}

	best_count = 0
	best_picture = ''
	best_percentage = 0

	hard_fail = False
	hard_fail_picture = ''

	# Width needed to hold every possible shift
	full_width = len(SeqOne) + len(SeqTwo) - 1
	padded_one = SeqOne + ' '*(full_width-len(SeqOne))
	padded_two = SeqTwo + ' '*(full_width-len(SeqTwo))
	padded_two_reversed = SeqTwo[::-1] + ' '*(full_width-len(SeqTwo))

	# (shifted sequence, fixed sequence) arrangements
	arrangements = [(padded_two, padded_one),
					(padded_one, padded_two)]
	if ReverseOrientation == True:
		arrangements += [(padded_one, padded_two_reversed),
						 (padded_two_reversed, padded_one)]

	# Extra weight for the three outermost columns of each end window
	end_weights = (3, 2, 1.5)

	for sliding, anchored in arrangements:
		for shift in range(sliding.count(' ')+1):

			shifted = sliding if shift == 0 else ' '*shift + sliding[:-shift]

			marks = ''.join(
				'|' if (base in pairing and pairing[base] == anchored[column]) else '-'
				for column, base in enumerate(shifted))

			bound_columns = marks.count('|')
			picture = shifted + '\n' + marks + '\n' + anchored

			# Columns occupied by SeqOne: it starts at 0 when it is the fixed
			# sequence, otherwise at the current shift
			five_index = 0 if anchored == padded_one else shift
			three_index = five_index + len(SeqOne)

			five_window = marks[five_index:five_index+22]
			three_window = marks[max(three_index-22, 0):three_index]

			window_scores = []
			for window, probe_columns in ((five_window, (0, 1, 2)),
										  (three_window, (-1, -2, -3))):
				score = window.count('|') - window.count('-')
				for column, weight in zip(probe_columns, end_weights):
					if window[column] == '|':
						score += weight - 1
				window_scores.append(score)

			if max(window_scores) >= 21.5:
				hard_fail = True
				hard_fail_picture = picture

			if best_count < bound_columns:
				best_count = bound_columns
				best_picture = picture

				if FixedBack == False:
					denominator = min([len(SeqOne),len(SeqTwo)])
				else:
					denominator = len(SeqOne)

				best_percentage = (best_count/denominator)*100

	return (best_percentage, best_picture, hard_fail, hard_fail_picture)


# Creating Alignment DF Multithread Function
def CreatingInputHomologyDF(fastadict,FastaIndexList):
	"""Summarise the requested alignment columns as consensus base and identity.

	Parameters:
		fastadict: ``{name: aligned sequence}``; every sequence must be long
			enough to be indexed by each position in ``FastaIndexList``.
		FastaIndexList: iterable of 0-based alignment column indexes.

	Returns:
		pandas.DataFrame with one row per column index and the fields
		'Index_Pos', 'Nucleotide' (most common symbol) and 'IdentityScore'
		(fraction of the counted symbols equal to the most common one).
		Ambiguity codes W/S/M/K/R/Y/N are not counted.  A column whose most
		common symbol is a gap, or that holds exactly two counted symbols one
		of which is a gap, gets '-' with score -100.  A column with nothing
		left to count gets 'N' with score 0.
	"""
	AmbiguityCodes = ('W','S','M','K','R','Y','N')

	records = []
	for seqindex in FastaIndexList:

		# Remove non-specific nucleotides (IUPAC ambiguity symbols)
		TempNucleotides = [faseq[seqindex] for faseq in fastadict.values()
						   if faseq[seqindex] not in AmbiguityCodes]

		if not TempNucleotides:
			# Every sequence was ambiguous here: there is no consensus, and
			# max() below would raise on the empty list.
			MostCommonN = 'N'
			NRepresentation = 0
		elif len(TempNucleotides) == 2 and '-' in TempNucleotides:
			MostCommonN = '-'
			NRepresentation = -100 #Harsh weighting against splits
		else:
			NucleotideCounts = Counter(TempNucleotides)
			# Ties go to the symbol that appears first in the column
			MostCommonN = max(TempNucleotides, key=NucleotideCounts.get)
			if MostCommonN == '-':
				NRepresentation = -100 #Harsh weighting against splits
			else:
				NRepresentation = NucleotideCounts[MostCommonN]/len(TempNucleotides)

		records.append({'Index_Pos':seqindex,'Nucleotide':MostCommonN,'IdentityScore':NRepresentation})
	return pd.DataFrame.from_records(records)


def IndentifyingAndFilteringOligos(AllParameter,
								   AlignedDF,
								   PossibleStartIndexes):
	"""Collect every candidate primer/probe binding site that passes filtering.

	For each requested oligo length and each start index the consensus window
	is taken from ``AlignedDF`` and kept only if it passes, in order: the mean
	identity threshold, a length check, the GC window (exclusive bounds), the
	nucleotide repeat limit, the self-dimerisation threshold, the exo probe
	layout check (EXO probes only) and, when a background file is configured,
	the cross-reactivity checks.

	Parameters:
		AllParameter: run settings; reads PrimerLength, ProbeRequired,
			ProbeLength, IdentityThreshold, MinGC, MaxGC,
			NucleotideRepeatLimit, DimerisationThresh, BackgroundCheck,
			CrossReactivityThresh and HardCrossReactFilter.
		AlignedDF: alignment summary with 'Index_Pos', 'Nucleotide' and
			'IdentityScore' columns.
		PossibleStartIndexes: iterable of alignment positions to try as the
			first base of an oligo.

	Returns:
		pandas.DataFrame with one row per accepted oligo (empty when nothing
		passes).
	"""

	# Keep each length specification paired with its oligo type.  Deriving the
	# type by comparing the specification with PrimerLength mislabels probes
	# as primers whenever both specifications are the same.
	TargetSites = [('Primer', AllParameter.PrimerLength)]
	if AllParameter.ProbeRequired in ['EXO','NFO']:
		TargetSites.append(('Probe', AllParameter.ProbeLength))

	BackgroundRequired = AllParameter.BackgroundCheck.upper() != 'NO'

	oligo_records = []
	for OligoType, TSLP in TargetSites:

		# A length is either a single value ("30") or an inclusive range ("28-32")
		if '-' in str(TSLP):
			TSLRangePrior = [int(TSLI) for TSLI in str(TSLP).split('-')]
			TSLList = list(range(TSLRangePrior[0],TSLRangePrior[1]+1))
		else:
			TSLList = [int(TSLP)]

		# Loop through possible oligo lengths.
		for TSL in TSLList:

			# Loop through possible start sites
			for i in PossibleStartIndexes:

				# Make sure the window does not go too far out of the dataframe
				if not (i+TSL<len(AlignedDF)-TSL):
					continue

				DFSubSet = AlignedDF[(AlignedDF['Index_Pos']>=i)&(AlignedDF['Index_Pos']<=(i+TSL-1))]
				MeanHomologyScore = DFSubSet['IdentityScore'].mean()
				if not (MeanHomologyScore >= AllParameter.IdentityThreshold):
					continue

				NucleotideSeq = ''.join(DFSubSet['Nucleotide'].tolist()).upper()

				# Check Length of Oligo
				if len(NucleotideSeq) != TSL:
					continue

				# Assess GC Content
				GCPercentage = ((NucleotideSeq.count('G') + NucleotideSeq.count('C'))/len(NucleotideSeq))*100
				if not (GCPercentage < AllParameter.MaxGC and GCPercentage > AllParameter.MinGC):
					continue

				# Assess Repeat Nucleotide
				if any((RROI * AllParameter.NucleotideRepeatLimit) in NucleotideSeq
					   for RROI in ['N','A','G','C','T']):
					continue

				# Run Secondary Structure Check / Self Dimerisation
				MaxBindingSites, MaxBindingString, IgnoreThresh, IgnoreString = SSIdentification(NucleotideSeq,NucleotideSeq,True)
				if MaxBindingSites > AllParameter.DimerisationThresh:
					continue

				# Run Specific Exo Probe Checks
				if (OligoType == 'Probe' and AllParameter.ProbeRequired == 'EXO'
						and not RunFluroProbeAnalysis(NucleotideSeq)):
					continue

				# Number of fully conserved columns counted inwards from each end
				IDSList = DFSubSet['IdentityScore'].tolist()
				FivePrimerIdentityScore = len(list(itertools.takewhile(lambda Score: float(Score) == 1.0, IDSList)))
				ThreePrimeIdentityScore = len(list(itertools.takewhile(lambda Score: float(Score) == 1.0, IDSList[::-1])))

				RowDict = {'Oligo_Sequence':NucleotideSeq,
						   'Oligo_Type':OligoType,
						   'Binding_Site_Start_Index':i,
						   'Oligo_Length':TSL,
						   'Identity_Score':MeanHomologyScore,
						   'GC_Content': GCPercentage,
						   'Dimerisation Percentage Score':MaxBindingSites,
						   'Dimerisation String':MaxBindingString,
						   '3 prime conserved identity':ThreePrimeIdentityScore,
						   '5 prime conserved identity':FivePrimerIdentityScore}

				# Background check (the most expensive step, so it runs last)
				if BackgroundRequired:

					MaxBackgroundScoreBindingScore, MaxScoreBackSeq, HardFailBool  = BlastnBackgroundCheck(NucleotideSeq, AllParameter)

					# Check to see if Max Binding Score exceeds the threshold
					if MaxBackgroundScoreBindingScore > AllParameter.CrossReactivityThresh:
						continue

					# Primers are additionally rejected on a hard fail when the filter is enabled
					if (HardFailBool == True and OligoType == 'Primer' and AllParameter.HardCrossReactFilter!='NO'):
						continue

					RowDict['Max Background Cross Reactivity Score'] = MaxBackgroundScoreBindingScore
					RowDict['Max Background Cross Reactivity SeqID'] = MaxScoreBackSeq

				oligo_records.append(RowDict)

	# Return Primer/Probe DataFrame
	return pd.DataFrame.from_records(oligo_records)


def ComboIdentifyier(PrimerSS,ReversePrimerSS,ProbeSS,AllParameter,PassedOligos,FPrimerL,RPrimerL,ProbeL):
	"""Build forward primer / reverse primer (/ probe) sets that fit one amplicon.

	Parameters:
		PrimerSS: start indexes of candidate forward primers.
		ReversePrimerSS: start indexes of candidate reverse primers.
		ProbeSS: start indexes of candidate probes (unused without a probe).
		AllParameter: run settings; reads ProbeRequired, AmpliconSizeLimit,
			DimerisationThresh and BackgroundCheck.
		PassedOligos: DataFrame of accepted oligos as written by
			``IndentifyingAndFilteringOligos``.
		FPrimerL, RPrimerL, ProbeL: lengths of the forward primer, reverse
			primer and probe handled by this call.

	Returns:
		pandas.DataFrame with one row per set whose highest pairwise
		dimerisation score does not exceed ``DimerisationThresh``.  The
		reverse primer is reported as the reverse complement of its binding
		site.
	"""

	ProbeNeeded = AllParameter.ProbeRequired in ['EXO','NFO']
	# Same case-insensitive test the rest of the script uses
	BackgroundRequired = AllParameter.BackgroundCheck.upper() != 'NO'

	def FirstOligoIndex(OligoType, OligoLength, StartIndex):
		# Label of the first PassedOligos row matching type, length and start
		Matches = PassedOligos[(PassedOligos['Oligo_Type']==OligoType)&
							   (PassedOligos['Oligo_Length']==OligoLength)&
							   (PassedOligos['Binding_Site_Start_Index']==StartIndex)]
		return Matches.index.tolist()[0]

	CombosList = []
	ps_records = []

	for FP in PrimerSS:

		# Latest start that keeps the reverse primer inside the amplicon limit
		LatestRPStart = FP+AllParameter.AmpliconSizeLimit-RPrimerL

		# If Probe Required
		if ProbeNeeded:
			Probes = [prss for prss in ProbeSS if (prss >= (FP+FPrimerL) and
												   prss <= (LatestRPStart-ProbeL))]

			for Probe in Probes:
				ReversePrimers = [rpss for rpss in ReversePrimerSS if (rpss >= (Probe+ProbeL) and
																	   rpss <= LatestRPStart)]

				CombosList += list(itertools.product([FP],ReversePrimers,[Probe]))

		# Probe Not Required
		else:
			ReversePrimers = [rpss for rpss in ReversePrimerSS if (rpss >= (FP+FPrimerL) and
																   rpss <= LatestRPStart)]

			CombosList += list(itertools.product([FP],ReversePrimers))

	# Extract Valid Primer-Probe Combos
	for Combo in CombosList:

		# Identify Indexes
		FPIndex = FirstOligoIndex('Primer', FPrimerL, Combo[0])
		RPIndex = FirstOligoIndex('Primer', RPrimerL, Combo[1])

		# Extract Combo Primer Sequences - (Reverse Complement RP)
		ComboSeqList = [PassedOligos.loc[FPIndex,'Oligo_Sequence'],getComplement(PassedOligos.loc[RPIndex,'Oligo_Sequence'],True)]

		# Add Probe Sequence If Necessary
		if ProbeNeeded:
			ProbeIndex = FirstOligoIndex('Probe', ProbeL, Combo[2])
			ComboSeqList.append(PassedOligos.loc[ProbeIndex,'Oligo_Sequence'])

		# Run Dimerisation Check on every pair of oligos in the set
		MaxComboSSScore = 0
		MaxComboSSString = ''
		for SSSCombo in itertools.combinations(ComboSeqList,r=2):
			SSMaxBindingSites, SSMaxBindingString, IgnoreThresh, IgnoreString = SSIdentification(SSSCombo[0],SSSCombo[1],True)

			if SSMaxBindingSites > MaxComboSSScore:
				MaxComboSSScore = SSMaxBindingSites
				MaxComboSSString = SSMaxBindingString

		if MaxComboSSScore > AllParameter.DimerisationThresh:
			continue

		PassedComboRow = {'Forward Primer (FP)':ComboSeqList[0],
						  'FP GC%':  PassedOligos.loc[FPIndex,'GC_Content'],
						  'FP Binding Start Site':  PassedOligos.loc[FPIndex,'Binding_Site_Start_Index'],
						  'Reverse Primer (RP)': ComboSeqList[1],
						  'RP GC%': PassedOligos.loc[RPIndex,'GC_Content'],
						  'RP Binding Start Site': PassedOligos.loc[RPIndex,'Binding_Site_Start_Index'],
						  'Amplicon Size': PassedOligos.loc[RPIndex,'Binding_Site_Start_Index'] + RPrimerL - PassedOligos.loc[FPIndex,'Binding_Site_Start_Index'],
						  # The maximum over all pairs, matching the string reported below
						  'Max Dimerisation Percentage Score':MaxComboSSScore,
						  'Max Dimerisation String':MaxComboSSString,
						  'Forward Primer Length':FPrimerL,
						  'Reverse Primer Length':RPrimerL,
						  "Minimum Primer 3' Identity Anchor": min([PassedOligos.loc[RPIndex,'5 prime conserved identity'], PassedOligos.loc[FPIndex,'3 prime conserved identity']])
						  }

		MaxBackgroundScoresIndexes = [FPIndex,RPIndex]

		if ProbeNeeded:
			PassedComboRow['Probe (P)'] = ComboSeqList[2]
			PassedComboRow['Probe GC%'] = PassedOligos.loc[ProbeIndex,'GC_Content']
			PassedComboRow['Probe Binding Start Site'] = PassedOligos.loc[ProbeIndex,'Binding_Site_Start_Index']
			PassedComboRow['Probe Length'] = ProbeL
			MaxBackgroundScoresIndexes.append(ProbeIndex)

		if BackgroundRequired:

			# Report the worst background score among the oligos of the set
			MaxBackgroundScoreBindingScore = 0
			MaxScoreBackSeq = ''
			for SucIndex in MaxBackgroundScoresIndexes:
				if PassedOligos.loc[SucIndex,'Max Background Cross Reactivity Score'] > MaxBackgroundScoreBindingScore:
					MaxBackgroundScoreBindingScore = PassedOligos.loc[SucIndex,'Max Background Cross Reactivity Score']
					MaxScoreBackSeq = PassedOligos.loc[SucIndex,'Max Background Cross Reactivity SeqID']

			PassedComboRow['Max Background Cross Reactivity Score'] = MaxBackgroundScoreBindingScore
			PassedComboRow['Max Background Binding Seq'] = MaxScoreBackSeq

		# Add Passed Row To Output
		ps_records.append(PassedComboRow)

	return pd.DataFrame.from_records(ps_records)

# Main Function To Generate Primer - Probe Combos
def CheckingAlignedOutputFile(AllParameter):
	"""Run the PrimedRPA design workflow and write the resulting CSV files.

	Candidate oligo binding sites are either loaded from
	``AllParameter.PriorBindingSite`` and re-filtered, or generated from an
	alignment summary (itself loaded from ``AllParameter.PriorAlign`` or built
	from ``AllParameter.InputFile``).  Forward primer, reverse primer and
	optional probe sites are then combined by ``ComboIdentifyier`` in worker
	processes and accumulated into ``<RunID>_Output_Sets.csv``.

	Side effects: writes ``<RunID>_Alignment_Summary.csv`` and
	``<RunID>_PrimedRPA_Oligo_Binding_Sites.csv`` when they are generated,
	replaces ``AllParameter.ConservedAnchor`` with an int, and calls
	``sys.exit()`` when no oligo passes filtering or the alignment has
	sequences of unequal length.
	"""

	# Same tests the rest of the script applies to these two settings
	BackgroundRequired = AllParameter.BackgroundCheck.upper() != 'NO'
	ProbeNeeded = AllParameter.ProbeRequired in ['EXO','NFO']

	# Check If Binding Sites Already Exist
	if AllParameter.PriorBindingSite != 'NO':

		print('Reading In Oligo Binding Sites')

		PassedOligos = pd.read_csv('{}'.format(AllParameter.PriorBindingSite))

		# Filter on Parameters Passed In
		PassedOligos = PassedOligos[(PassedOligos['GC_Content']>=AllParameter.MinGC)&
									(PassedOligos['GC_Content']<=AllParameter.MaxGC)&
									(PassedOligos['Dimerisation Percentage Score']<=AllParameter.DimerisationThresh)&
									(PassedOligos['Identity_Score']>=AllParameter.IdentityThreshold)]

		# Filter on Primer + Probe length.  The type travels with its length
		# specification: inferring it by comparing the specification with
		# PrimerLength fails when primer and probe specifications are equal.
		LengthSubsets = {}
		for LLType, LL in [('Primer',AllParameter.PrimerLength),('Probe',AllParameter.ProbeLength)]:

			# "28-32" is an inclusive range, "30" a single length
			if '-' in str(LL):
				LLRangePrior = [int(LLLI.strip()) for LLLI in str(LL).split('-')]
			else:
				LLRangePrior = [int(LL)]

			LengthSubsets[LLType] = PassedOligos[(PassedOligos['Oligo_Type']==LLType) &
												 (PassedOligos['Oligo_Length']>=min(LLRangePrior)) &
												 (PassedOligos['Oligo_Length']<=max(LLRangePrior))]

		PrimerPassedSubet = LengthSubsets['Primer']
		ProbePassedSubet = LengthSubsets['Probe']

		PassedOligos = pd.concat([PrimerPassedSubet,ProbePassedSubet]).reset_index().drop(['index'],axis=1)

		# If Background Check is Present filter
		if BackgroundRequired:
			PassedOligos = PassedOligos[PassedOligos['Max Background Cross Reactivity Score']<=AllParameter.CrossReactivityThresh]

		## Check If No Primers Passed Subset
		if len(PrimerPassedSubet)==0:
			print('No Oligos Passed Filtering')
			sys.exit()

		if ProbeNeeded and len(ProbePassedSubet)==0:
			print('No Oligos Passed Filtering')
			sys.exit()

	# Create Binding DataFrame If It Doesnt Exist
	else:

		# Load In User Specified
		if AllParameter.PriorAlign != 'NO':
			print('Reading Alignment Summary')
			MergedAlignedDF = pd.read_csv('{}'.format(AllParameter.PriorAlign))
			HDFLST = list(range(len(MergedAlignedDF)))
			# NOTE(review): each block holds `Threads` consecutive positions,
			# so the number of tasks grows with the alignment length.
			HomoDFInputIndexBlocks = [HDFLST[i:i + AllParameter.Threads] for i in range(0, len(HDFLST), AllParameter.Threads)]

		# Create Alignment DF if it doesnt exist
		else:
			print('Generating Alignment Summary')

			# Read in the aligned fasta sequences
			fastadict = FastaToDict(AllParameter.InputFile)

			# Extract Alignment Length and QC: every sequence must be as long as the first
			FirstSeq = True
			FastaSeqLength = 0
			for fastaseq in fastadict.values():
				if FirstSeq == True:
					FastaSeqLength = len(fastaseq)
					FirstSeq = False
				if len(fastaseq) != FastaSeqLength:
					sys.exit('ERROR: Alignment file error length of all sequences is not equal')

			# Generate Necessary Alignment Summary DF
			HDFLST = list(range(FastaSeqLength))
			HomoDFInputIndexBlocks = [HDFLST[i:i + AllParameter.Threads] for i in range(0, len(HDFLST), AllParameter.Threads)]
			MTDFOVI = list(zip([fastadict]*len(HomoDFInputIndexBlocks),HomoDFInputIndexBlocks))

			with Pool(processes=AllParameter.Threads) as pool:
				AlignedDFMultiThreadOupt = pool.starmap(CreatingInputHomologyDF,MTDFOVI)
			MergedAlignedDF = pd.concat(AlignedDFMultiThreadOupt).reset_index().drop(['index'],axis=1)
			MergedAlignedDF = MergedAlignedDF.sort_values(by=['Index_Pos'])
			MergedAlignedDF.to_csv('{}_Alignment_Summary.csv'.format(AllParameter.RunID),index=None)

		print('Generating Primer/Probe Binding Site DataFrame')
		# Generating Primer/Probe Binding Site DataFrame
		PrimerProbeCheckParallelInput = list(zip([AllParameter]*len(HomoDFInputIndexBlocks),
												 [MergedAlignedDF]*len(HomoDFInputIndexBlocks),
												 HomoDFInputIndexBlocks))

		with Pool(processes=AllParameter.Threads) as pool:
			PotentialPrimerProbeOut = pool.starmap(IndentifyingAndFilteringOligos,PrimerProbeCheckParallelInput)

		PassedOligos =  pd.concat(PotentialPrimerProbeOut).reset_index().drop(['index'],axis=1)
		if len(PassedOligos) == 0:
			print('No Oligos Passed Filtering')
			sys.exit()
		PassedOligos.to_csv('{}_PrimedRPA_Oligo_Binding_Sites.csv'.format(AllParameter.RunID),index=None)

	# Identify Primer-Probe Sets
	print('Identifying Valid Primer-Probe Combinations')
	FinalOutputDF = pd.DataFrame()

	# Determine all probe lengths available (0 stands for "no probe")
	if ProbeNeeded:
		PossibleProbeLengths = PassedOligos[PassedOligos['Oligo_Type']=='Probe'].loc[:,'Oligo_Length'].unique().tolist()
	else:
		PossibleProbeLengths = [0]

	# Normalise the conserved anchor requirement to an int ('NO' means none)
	if str(AllParameter.ConservedAnchor).upper() == 'NO':
		AllParameter.ConservedAnchor = 0
	else:
		AllParameter.ConservedAnchor = int(AllParameter.ConservedAnchor)

	IsPrimer = PassedOligos['Oligo_Type']=='Primer'
	ForwardAnchored = PassedOligos['3 prime conserved identity']>=AllParameter.ConservedAnchor
	ReverseAnchored = PassedOligos['5 prime conserved identity']>=AllParameter.ConservedAnchor

	# Loop though Starting Forward Length
	for SFPL in PassedOligos[IsPrimer&ForwardAnchored].loc[:,'Oligo_Length'].unique():

		# Identify all possible FP starting sites
		PossibleForwardPrimerSS = PassedOligos[IsPrimer&(PassedOligos['Oligo_Length']==SFPL)&ForwardAnchored].loc[:,'Binding_Site_Start_Index'].tolist()

		# Loop through all possible RP lengths
		for SRPL in PassedOligos[IsPrimer&ReverseAnchored].loc[:,'Oligo_Length'].unique():

			# Identify all possible RP starting sites
			PossibleReversePrimerSS = PassedOligos[IsPrimer&(PassedOligos['Oligo_Length']==SRPL)&ReverseAnchored].loc[:,'Binding_Site_Start_Index'].tolist()

			# Shuffle the FP sites, pair them up, then group the pairs into
			# batches of `Threads` so each batch occupies the worker pool once.
			random.shuffle(PossibleForwardPrimerSS)
			PrimerStartSiteTranchesOne = [PossibleForwardPrimerSS[i:i + 2] for i in range(0, len(PossibleForwardPrimerSS), 2)]
			PrimerStartSiteTranchesTwo = [PrimerStartSiteTranchesOne[i:i + AllParameter.Threads] for i in range(0, len(PrimerStartSiteTranchesOne), AllParameter.Threads)]

			# Loop through possible probe lengths.
			for PPL in PossibleProbeLengths:

				if PPL == 0:
					PossibleProbeSS = []
				else:
					PossibleProbeSS = PassedOligos[(PassedOligos['Oligo_Type']=='Probe')&(PassedOligos['Oligo_Length']==PPL)].loc[:,'Binding_Site_Start_Index'].tolist()

				for PSST in PrimerStartSiteTranchesTwo:

					# The cap is only tested between batches, so the final
					# table may hold more than MaxSets rows.
					if len(FinalOutputDF) < AllParameter.MaxSets:
						ComboSearchInput = list(zip(PSST,
													[PossibleReversePrimerSS]*len(PSST),
													[PossibleProbeSS]*len(PSST),
													[AllParameter]*len(PSST),
													[PassedOligos]*len(PSST),
													[SFPL]*len(PSST),
													[SRPL]*len(PSST),
													[PPL]*len(PSST)))

						with Pool(processes=AllParameter.Threads) as pool:
							SuccessfulSets = pool.starmap(ComboIdentifyier,ComboSearchInput)

						TempOutputDF = pd.concat(SuccessfulSets)
						FinalOutputDF = pd.concat([FinalOutputDF,TempOutputDF])
						if len(FinalOutputDF) != 0:
							FinalOutputDF.to_csv('{}_Output_Sets.csv'.format(AllParameter.RunID),index=None)

	print('PrimedRPA Complete')
	print('If helpful, please cite:\n\nHiggins M et al. Submitted. 2018\nDOI: 10.1093/bioinformatics/bty701')


########################################################################################################################
########################################################################################################################
########################################################################################################################


# Start-up banner, printed one entry per print() call
for BannerLine in ('\n\n',
				   '-------------------------------------------',
				   '----------------PrimedRPA------------------',
				   '-----Finding RPA Primer and Probe Sets-----',
				   '-------------Higgins M et al.--------------',
				   '-------------------------------------------\n\n'):
	print(BannerLine)


# Utilise Either Parameters File Or Command Line Interface.
# With no argument at all there is nothing to parse.
if len(sys.argv) < 2:
	print('Please run PrimedRPA --help to see valid options')
	sys.exit()

if 'PrimedRPA_Parameters.txt' in sys.argv[1]:

	# Import and Extract Parameters: every line containing '>' carries one
	# value; the first such line is skipped.
	parametersFile = sys.argv[1]
	try:
		with open(parametersFile,"r") as paraFile:
			iu = [line.strip('\n').strip('>') for line in paraFile if ">" in line]
	except OSError:
		print('Parameters File Could Not Be Opened\nCheck File Path.')
		sys.exit()

	u = iu[1:]

	try:
		class AllParameter:
			RunID = str(u[0])
			PriorAlign = str(u[1])
			PriorBindingSite = str(u[2])
			InputFile = str(u[3])
			InputFileType = str(u[4])
			# Given as a percentage in the file, used as a fraction
			IdentityThreshold = int(u[5])/100
			ConservedAnchor = str(u[6]).strip()
			PrimerLength = u[7].strip()
			ProbeRequired = str(u[8]).upper()
			ProbeLength = u[9].strip()
			AmpliconSizeLimit = int(u[10])
			NucleotideRepeatLimit = int(u[11])
			MinGC = int(u[12])
			MaxGC = int(u[13])
			DimerisationThresh = int(u[14])
			BackgroundCheck = str(u[15])
			CrossReactivityThresh = int(u[16])
			HardCrossReactFilter = str(u[17])
			MaxSets = int(u[18])
			Threads = int(u[19])
			BackgroundSearchSensitivity = str(u[20])
			Evalue=int(u[21])

	except (IndexError, ValueError) as ParameterError:
		# A missing line or a non-numeric value, not a problem opening the file
		print('Parameters File Could Not Be Parsed: {}'.format(ParameterError))
		print('Please run PrimedRPA --help to see valid options')
		sys.exit()

else:

	# argparse prints its own usage/error text and exits on invalid input
	parser = argparse.ArgumentParser()
	parser.add_argument('--RunID', help='Desired Run ID', required=True)
	parser.add_argument('--PriorAlign', help='Optional: Path to Prior Alignment Summary File',default='NO')
	parser.add_argument('--PriorBindingSite', help='Optional: Path to Prior Binding File',default='NO')
	parser.add_argument('--InputFile', help='Path to Input File',default='NO')
	parser.add_argument('--InputFileType', help='Options [SS,MS,MAS]')
	# type=float: without it a user-supplied value stays a string and cannot
	# be compared with the numeric identity scores
	parser.add_argument('--IdentityThreshold', help='Desired Identity Threshold',type=float,default=0.99)
	parser.add_argument('--ConservedAnchor', help='Identity Anchor Required',type=str,default='NO')
	parser.add_argument('--PrimerLength', help='Desired Primer Length',type=str,default='30')
	parser.add_argument('--ProbeRequired', help='Options [NO,EXO,NFO]',type=str,default='NO')
	parser.add_argument('--ProbeLength', help='Desired Probe Length',type=str,default='50')
	parser.add_argument('--AmpliconSizeLimit', help='Amplicon Size Limit',type=int,default=500)
	parser.add_argument('--NucleotideRepeatLimit', help='Nucleotide Repeat Limit (i.e 5 = AAAAA)',type=int,default=5)
	parser.add_argument('--MinGC', help='Minimum GC Content',type=int,default=30)
	parser.add_argument('--MaxGC', help='Maximum GC Content',type=int,default=70)
	parser.add_argument('--DimerisationThresh', help='Percentage Dimerisation Tolerated',type=int,default=40)
	parser.add_argument('--BackgroundCheck', help='Options [NO, Path To Background Fasta File]',default='NO')
	parser.add_argument('--CrossReactivityThresh', help='Max Cross Reactivity Threshold',type=int,default=60)
	parser.add_argument('--HardCrossReactFilter', help='Hard Cross Reactivity Filter',type=str,default='NO')
	parser.add_argument('--MaxSets', help='Default Set To 100',type=int,default=100)
	parser.add_argument('--Threads', help='Default Set To 1',type=int,default=1)
	parser.add_argument('--BackgroundSearchSensitivity', help='Options [Basic or Advanced or Fast]',type=str,default='Basic')
	parser.add_argument('--Evalue', help='Default Set To 1000',type=int,default=1000)

	AllParameter = parser.parse_args()


# Check Files Defined Exist
FileLocationsCheckList = [AllParameter.PriorAlign,
						  AllParameter.PriorBindingSite,
						  AllParameter.BackgroundCheck,
						  AllParameter.InputFile]

# 'NO' and '' mean "not supplied".  Anything else must exist on disk.
# os.path.exists is used instead of a glob match so that paths containing
# glob metacharacters such as '[' or '*' are checked literally.
for FL in FileLocationsCheckList:
	if FL != 'NO' and FL != '' and not os.path.exists(FL):
		print('File Not Found: {}'.format(FL))
		sys.exit()


# Define tool dependency paths
AllParameter.BlastnPath = 'blastn'
AllParameter.BlastnDBCreationPath = 'makeblastdb'
AllParameter.ClustalOPath = 'clustalo'
AllParameter.SamtoolsPath = 'samtools'


# Create Blastn Database if necessary
if AllParameter.BackgroundCheck.upper() != 'NO':

	# The database lives in a directory named after the background file
	AllParameter.BlastnDBName =  '{}_Blastn_DB_PrimedRPA'.format(AllParameter.BackgroundCheck.split('/')[-1].split('.')[0])
	if not os.path.exists(AllParameter.BlastnDBName):
		os.makedirs(AllParameter.BlastnDBName)
		makeblastdbcommand = '{0} -in {1} -dbtype nucl -parse_seqids -out {2}/{2}'.format(AllParameter.BlastnDBCreationPath,
																						  AllParameter.BackgroundCheck,
																						  AllParameter.BlastnDBName)

		print('\nCreating Blastn Database:')
		MakeBlastDBStatus = subprocess.call([makeblastdbcommand],shell=True)
		print('\n')

		if MakeBlastDBStatus != 0:
			# The directory was created just above, so everything in it is a
			# partial database.  Remove it so the next run rebuilds the
			# database instead of silently reusing a broken one.
			for LeftOverFile in os.listdir(AllParameter.BlastnDBName):
				os.remove(os.path.join(AllParameter.BlastnDBName, LeftOverFile))
			os.rmdir(AllParameter.BlastnDBName)
			print('Blastn Database Could Not Be Created (makeblastdb exit status {})'.format(MakeBlastDBStatus))
			sys.exit()

	# Create directory to house blastn outputs
	AllParameter.PrimerBlastnOutput = '{}/{}'.format(AllParameter.BlastnDBName,AllParameter.RunID)
	os.makedirs(AllParameter.PrimerBlastnOutput, exist_ok=True)


# Check If Alignment Is Necessary
if (AllParameter.InputFileType == 'MS' and AllParameter.InputFile != 'NO') :

	# NOTE(review): when the input name does not contain '.fasta' this equals
	# the input path, so no alignment is run - confirm that is acceptable.
	AlignedInputFile = AllParameter.InputFile.replace('.fasta','_Aligned.fasta')

	# Only align when the aligned file is not already present.  The path is
	# tested literally, so glob metacharacters in the name are harmless.
	if not os.path.exists(AlignedInputFile):
		RunningClustalo1(AllParameter.ClustalOPath,[AllParameter.InputFile], overwriteOutput=False)

	AllParameter.InputFile = AlignedInputFile


# Run Primer, Probe Design process.
# The guard means the workflow is started only when this file is executed as
# a script, not when it is imported as a module.
if __name__ == "__main__":
	CheckingAlignedOutputFile(AllParameter)