1.python计算序列GC含量
参考http://rosalind.info/problems/gc/
import sys,os,re
def parse_fasta(s):
    """Parse FASTA-formatted text into a ``{label: sequence}`` dictionary.

    The text is cut at every ``>``.  Within each record the first line is the
    header and all following lines are sequence data.

    Args:
        s: The full FASTA text.

    Returns:
        A dict mapping each record's label (the first whitespace-delimited
        token of its header line) to its sequence with all whitespace
        removed.  Records without a label are skipped.  If a label occurs
        more than once, the last record wins.
    """
    results = {}
    for record in s.strip().split('>'):
        lines = record.splitlines()
        header = lines[0].split() if lines else []
        if not header:
            # Empty chunk (e.g. the text before the first '>') or a '>' that
            # is not followed by a label: there is no key to store it under.
            continue
        # Only the first header token is the label; any description after it
        # must not leak into the sequence.  Sequence lines are concatenated
        # and stripped of embedded whitespace.
        results[header[0]] = ''.join(''.join(lines[1:]).split())
    return results
def gc_content(s):
    """Return the GC content of a sequence as a percentage.

    Args:
        s: The sequence, iterated character by character.

    Returns:
        ``100 * (number of G/C characters) / len(s)`` as a float.  Both
        upper- and lower-case ``g``/``c`` are counted.  An empty sequence
        yields ``0.0`` instead of raising ZeroDivisionError.
    """
    n = len(s)
    if n == 0:
        # Nothing to measure; avoid dividing by zero.
        return 0.0
    gc_symbols = ('G', 'C', 'g', 'c')
    m = sum(1 for c in s if c in gc_symbols)
    return 100 * (float(m) / n)
if __name__ == "__main__":
    # Read the whole FASTA file named by the first command-line argument.
    # The context manager closes the handle even if reading fails.
    with open(sys.argv[1], "r") as input_file:
        large_dataset = input_file.read()
    # First mapping: record label -> sequence.
    sequences = parse_fasta(large_dataset)
    # Second mapping: record label -> GC percentage.
    results = {label: gc_content(bases) for label, bases in sequences.items()}
    print(results.items())
2.python根据序列表筛选序列
import sys
def usage():
    """Print the expected command-line invocation to standard output."""
    message = 'Usage: python3 pick_up.py [fasta_file] [idlist_file] [outfile_name]'
    print(message)
def main():
    """Copy the FASTA records whose IDs appear in an ID list to a new file.

    Reads three paths from ``sys.argv``: the FASTA file (1), the ID list
    with one ID per line (2), and the output file (3).  For every listed ID
    that matches a record, the record is written as a ``>ID`` line followed
    by its sequence on a single line, in the order of the ID list.

    A record's ID is the first whitespace-delimited token of its header line
    without the leading ``>``.  Lines that appear before the first header are
    ignored.  If an ID occurs more than once in the FASTA file, the last
    record wins.

    Raises:
        IndexError: If fewer than three arguments were supplied.
    """
    fasta_path = sys.argv[1]
    idlist_path = sys.argv[2]
    out_path = sys.argv[3]

    # Index the FASTA file as ID -> list of sequence fragments.  Fragments
    # are joined once at write time instead of growing a string per line.
    sequences = {}
    name = None
    with open(fasta_path, 'r') as fastaf:
        for line in fastaf:
            if line.startswith('>'):
                name = line.strip().split()[0][1:]
                sequences[name] = []
            elif name is not None:
                sequences[name].append(line.rstrip('\n'))

    # Both handles are closed by the context manager even if writing fails.
    with open(idlist_path, 'r') as listf, open(out_path, 'w') as outf:
        for row in listf:
            key = row.strip()
            # Direct lookup instead of scanning every key for each ID.
            if key in sequences:
                outf.write('>' + key + '\n')
                outf.write(''.join(sequences[key]) + '\n')
# main() needs three arguments (FASTA file, ID list, output file).  Check the
# count explicitly rather than wrapping main() in ``except IndexError``, so
# that an IndexError raised for any other reason is not reported as a usage
# problem.
if len(sys.argv) < 4:
    usage()
else:
    main()
3.python统计序列长度
https://blog.csdn.net/tangxc10/article/details/48833989
#!/usr/bin/python
import sys,os,re
def process_file(reader):
    """Collect FASTA records from an iterable of lines.

    Args:
        reader: An iterable of text lines, e.g. an open file or the result
            of ``readlines()``.

    Returns:
        A dict mapping each header line (including the leading ``>``, without
        the line terminator) to the concatenation of the sequence lines that
        follow it.  A header with no sequence lines maps to ``''``.  Lines
        before the first header are ignored.  If a header occurs more than
        once, the last record wins.
    """
    fragments = {}
    current = None
    for line in reader:
        # Strip only the line terminator.  Slicing off the last character
        # unconditionally would drop a real base when the final line has no
        # trailing newline.
        text = line.rstrip('\r\n')
        if line.startswith('>'):
            current = []
            fragments[text] = current
        elif current is not None:
            current.append(text)
    return {header: ''.join(parts) for header, parts in fragments.items()}
if __name__ == "__main__":
    # Parse the FASTA file named by the first command-line argument.  The
    # context manager closes the handle even if parsing raises.
    with open(sys.argv[1], "r") as input_file:
        items = process_file(input_file)
    # Print each header line followed by the length of its sequence.
    for key, seq in items.items():
        print(key, len(seq))