In the previous post, I did not clarify the questions properly, therefore, I would like to start a new topic here.
I have the following items:
a sorted list of 59,000 protein patterns (range from 3 characters "FFK" to 152 characters long);
some long protein sequences, aka my reference.
I am going to match these patterns against my reference and find the location of where the match is found. (My friend helped wrtoe a script for that.)
import sys
import re
from itertools import chain, izip
# Read input
with open(sys.argv[1], 'r') as f:
sequences = f.read().splitlines()
with open(sys.argv[2], 'r') as g:
patterns = g.read().splitlines()
# Write output
with open(sys.argv[3], 'w') as outputFile:
data_iter = iter(sequences)
order = ['antibody name', 'epitope sequence', 'start', 'end', 'length']
header = '\t'.join([k for k in order])
outputFile.write(header + '\n')
for seq_name, seq in izip(data_iter, data_iter):
locations = [[{'antibody name': seq_name, 'epitope sequence': pattern, 'start': match.start()+1, 'end': match.end(), 'length': len(pattern)} for match in re.finditer(pattern, seq)] for pattern in patterns]
for loc in chain.from_iterable(locations):
output = '\t'.join([str(loc[k]) for k in order])
outputFile.write(output + '\n')
f.close()
g.close()
outputFile.close()
Problem is, within these 59,000 patterns, after sorted, I found that some part of one pattern match with part of the other patterns, and I would like to consolidate these into one big "consensus" patterns and just keep the consensus (see examples below):
TLYLQMNSLRAED
TLYLQMNSLRAEDT
YLQMNSLRAED
YLQMNSLRAEDT
YLQMNSLRAEDTA
YLQMNSLRAEDTAV
will yield
TLYLQMNSLRAEDTAV
another example:
APRLLIYGASS
APRLLIYGASSR
APRLLIYGASSRA
APRLLIYGASSRAT
APRLLIYGASSRATG
APRLLIYGASSRATGIP
APRLLIYGASSRATGIPD
GQAPRLLIY
KPGQAPRLLIYGASSR
KPGQAPRLLIYGASSRAT
KPGQAPRLLIYGASSRATG
KPGQAPRLLIYGASSRATGIPD
LLIYGASSRATG
LLIYGASSRATGIPD
QAPRLLIYGASSR
will yield
KPGQAPRLLIYGASSRATGIPD
PS : I am aligning them here so it's easier to visualize. The 59,000 patterns initially are not sorted so it's hard to see the consensus in the actual file.
In my particular problem, I am not picking the longest patterns, instead, I need to take each pattern into account to find the consensus. I hope I have explained clearly enough for my specific problem.
Thanks!
Here's my solution with randomized input order to improve confidence of the test.
import re
import random
data_values = """TLYLQMNSLRAED
TLYLQMNSLRAEDT
YLQMNSLRAED
YLQMNSLRAEDT
YLQMNSLRAEDTA
YLQMNSLRAEDTAV
APRLLIYGASS
APRLLIYGASSR
APRLLIYGASSRA
APRLLIYGASSRAT
APRLLIYGASSRATG
APRLLIYGASSRATGIP
APRLLIYGASSRATGIPD
GQAPRLLIY
KPGQAPRLLIYGASSR
KPGQAPRLLIYGASSRAT
KPGQAPRLLIYGASSRATG
KPGQAPRLLIYGASSRATGIPD
LLIYGASSRATG
LLIYGASSRATGIPD
QAPRLLIYGASSR"""
test_li1 = data_values.split()
#print(test_li1)
test_li2 = ["abcdefghi", "defghijklmn", "hijklmnopq", "mnopqrst", "pqrstuvwxyz"]
def aggregate_str(data_li):
copy_data_li = data_li[:]
while len(copy_data_li) > 0:
remove_li = []
len_remove_li = len(remove_li)
longest_str = max(copy_data_li, key=len)
copy_data_li.remove(longest_str)
remove_li.append(longest_str)
while len_remove_li != len(remove_li):
len_remove_li = len(remove_li)
for value in copy_data_li:
value_pattern = "".join([x+"?" for x in value])
longest_match = max(re.findall(value_pattern, longest_str), key=len)
if longest_match in value:
longest_str_index = longest_str.index(longest_match)
value_index = value.index(longest_match)
if value_index > longest_str_index and longest_str_index > 0:
longest_str = value[:value_index] + longest_str
copy_data_li.remove(value)
remove_li.append(value)
elif value_index < longest_str_index and longest_str_index + len(longest_match) == len(longest_str):
longest_str += value[len(longest_str)-longest_str_index:]
copy_data_li.remove(value)
remove_li.append(value)
elif value in longest_str:
copy_data_li.remove(value)
remove_li.append(value)
print(longest_str)
print(remove_li)
random.shuffle(test_li1)
random.shuffle(test_li2)
aggregate_str(test_li1)
#aggregate_str(test_li2)
Output from print().
KPGQAPRLLIYGASSRATGIPD
['KPGQAPRLLIYGASSRATGIPD', 'APRLLIYGASS', 'KPGQAPRLLIYGASSR', 'APRLLIYGASSRAT', 'APRLLIYGASSR', 'APRLLIYGASSRA', 'GQAPRLLIY', 'APRLLIYGASSRATGIPD', 'APRLLIYGASSRATG', 'QAPRLLIYGASSR', 'LLIYGASSRATG', 'KPGQAPRLLIYGASSRATG', 'KPGQAPRLLIYGASSRAT', 'LLIYGASSRATGIPD', 'APRLLIYGASSRATGIP']
TLYLQMNSLRAEDTAV
['YLQMNSLRAEDTAV', 'TLYLQMNSLRAED', 'TLYLQMNSLRAEDT', 'YLQMNSLRAED', 'YLQMNSLRAEDTA', 'YLQMNSLRAEDT']
Edit1 - brief explanation of the code.
1.) Find longest string in list
2.) Loop through all remaining strings and find longest possible match.
3.) Make sure that the match is not a false positive. Based on the way I've written this code, it should avoid pairing single overlaps on terminal ends.
4.) Append the match to the longest string if necessary.
5.) When nothing else can be added to the longest string, repeat the process (1-4) for the next longest string remaining.
Edit2 - Corrected unwanted behavior when treating data like ["abcdefghijklmn", "ghijklmZopqrstuv"]
def main():
#patterns = ["TLYLQMNSLRAED","TLYLQMNSLRAEDT","YLQMNSLRAED","YLQMNSLRAEDT","YLQMNSLRAEDTA","YLQMNSLRAEDTAV"]
patterns = ["APRLLIYGASS","APRLLIYGASSR","APRLLIYGASSRA","APRLLIYGASSRAT","APRLLIYGASSRATG","APRLLIYGASSRATGIP","APRLLIYGASSRATGIPD","GQAPRLLIY","KPGQAPRLLIYGASSR","KPGQAPRLLIYGASSRAT","KPGQAPRLLIYGASSRATG","KPGQAPRLLIYGASSRATGIPD","LLIYGASSRATG","LLIYGASSRATGIPD","QAPRLLIYGASSR"]
test = find_core(patterns)
test = find_pre_and_post(test, patterns)
#final = "YLQMNSLRAED"
final = "KPGQAPRLLIYGASSRATGIPD"
if test == final:
print("worked:" + test)
else:
print("fail:"+ test)
def find_pre_and_post(core, patterns):
pre = ""
post = ""
for pattern in patterns:
start_index = pattern.find(core)
if len(pattern[0:start_index]) > len(pre):
pre = pattern[0:start_index]
if len(pattern[start_index+len(core):len(pattern)]) > len(post):
post = pattern[start_index+len(core):len(pattern)]
return pre+core+post
def find_core(patterns):
test = ""
for i in range(len(patterns)):
for j in range(2,len(patterns[i])):
patterncount = 0
for pattern in patterns:
if patterns[i][0:j] in pattern:
patterncount += 1
if patterncount == len(patterns):
test = patterns[i][0:j]
return test
main()
So what I do first is find the main core in the find_core
function by starting with a string of length two, as one character is not sufficient information, for the first string. I then compare that substring and see if it is in ALL the strings as the definition of a "core"
I then find the indexes of the substring in each string to then find the pre and post substrings added to the core. I keep track of these lengths and update them if one length is greater than the other. I didn't have time to explore edge cases so here is my first shot