Bootstrap

python 处理 多序列总结

1.python计算序列GC含量

参考http://rosalind.info/problems/gc/

import sys,os,re

def parse_fasta(s):
    """Parse FASTA-formatted text into a ``{label: sequence}`` dict.

    The text is split on ``>``. Within each record the first line is the
    header and everything after it is sequence data. The label is the first
    whitespace-delimited token of the header, so a trailing description
    (``>id some text``) is not folded into the bases. Sequence lines are
    concatenated with all whitespace removed.

    Records whose header line is blank are skipped. If a label occurs more
    than once, the last record wins.

    Args:
        s: Complete contents of a FASTA file as a single string.

    Returns:
        dict mapping each record label to its sequence string.
    """
    results = {}

    for record in s.strip().split('>'):
        # Separate the header line from the sequence lines so that only the
        # sequence part contributes bases.
        header, _, body = record.partition('\n')
        header_tokens = header.split()
        if not header_tokens:
            # Empty chunk (e.g. the text before a leading '>') or a record
            # with no identifier: skip it and move on to the next record.
            continue

        results[header_tokens[0]] = ''.join(body.split())

    return results


def gc_content(s):
    """Return the GC content of a nucleotide sequence as a percentage.

    G and C are counted case-insensitively, so lowercase (soft-masked)
    bases are included. Every character of ``s`` counts towards the
    denominator.

    Args:
        s: Sequence string.

    Returns:
        float in the range 0-100; ``0.0`` for an empty sequence, which
        would otherwise cause a division by zero.
    """
    n = len(s)
    if n == 0:
        return 0.0

    upper = s.upper()
    m = upper.count('G') + upper.count('C')

    return 100 * (float(m) / n)


if __name__ == "__main__":
    # Read the whole FASTA file named by the first command-line argument.
    # The context manager closes the handle even if reading fails.
    with open(sys.argv[1], "r") as input_file:
        large_dataset = input_file.read()

    # `results` first maps label -> sequence; after the comprehension it
    # maps label -> GC percentage.
    results = parse_fasta(large_dataset)
    results = {k: gc_content(v) for k, v in results.items()}

    print(results.items())

2.python根据序列表筛选序列

import sys

def usage():
    """Print the command-line synopsis of the pick-up script to stdout."""
    synopsis = 'Usage: python3 pick_up.py [fasta_file] [idlist_file] [outfile_name]'
    print(synopsis)


def main():
    """Extract the sequences named in an ID list from a FASTA file.

    Command-line arguments (read from ``sys.argv``):
        1. path of the input FASTA file
        2. path of a text file with one sequence ID per line
        3. path of the output FASTA file to write

    A sequence ID is the first whitespace-delimited token of a header line
    with the leading ``>`` removed. Selected records are written in the
    order of the ID list, each as a header line followed by the whole
    sequence on one line. IDs that are not present in the FASTA file are
    ignored.

    Raises:
        IndexError: if fewer than three arguments were supplied.
    """
    # Index argv up front so that a missing argument raises IndexError
    # before any file is opened or truncated.
    fasta_path = sys.argv[1]
    idlist_path = sys.argv[2]
    out_path = sys.argv[3]

    # name -> list of sequence line fragments, joined once after reading.
    chunks = {}
    name = None
    with open(fasta_path, 'r') as fastaf:
        for line in fastaf:
            if line.startswith('>'):
                name = line.strip().split()[0][1:]
                chunks[name] = []
            elif name is not None:
                # Lines before the first header belong to no record and
                # are skipped.
                chunks[name].append(line.replace('\n', ''))
    sequences = {key: ''.join(parts) for key, parts in chunks.items()}

    with open(idlist_path, 'r') as listf, open(out_path, 'w') as outf:
        for row in listf:
            key = row.strip()
            # Direct dict lookup instead of scanning every key per ID.
            if key in sequences:
                outf.write('>' + key + '\n')
                outf.write(sequences[key] + '\n')


# Entry point: main() needs three positional arguments after the script name.
# Checking the count explicitly (rather than catching IndexError around the
# whole of main()) keeps an unrelated IndexError from being reported as a
# usage error.
if len(sys.argv) < 4:
    usage()
else:
    main()

3.python统计序列长度

https://blog.csdn.net/tangxc10/article/details/48833989

#!/usr/bin/python
import sys,os,re


def process_file(reader):
    """Collect FASTA records from an iterable of lines.

    Each line starting with ``>`` begins a new record; the whole header
    line (including the ``>``, without its line ending) is used as the
    key. Following lines are concatenated, with line endings removed, to
    form the sequence.

    A record is only added once its first sequence line has been seen, so
    a header with no sequence lines does not appear in the result. Lines
    before the first header are ignored. If a header occurs more than
    once, the last record wins.

    Args:
        reader: Iterable of text lines, e.g. the result of
            ``file.readlines()``.

    Returns:
        dict mapping each header line to its sequence string.
    """
    chunks = {}
    name = None
    parts = []

    for line in reader:
        # Strip only the line ending; slicing off the last character would
        # drop a base when the final line has no trailing newline.
        text = line.rstrip('\r\n')
        if line.startswith('>'):
            name = text
            parts = []
        elif name is not None:
            parts.append(text)
            chunks[name] = parts

    return {header: ''.join(pieces) for header, pieces in chunks.items()}


if __name__ == "__main__":
    # Read the FASTA file named by the first command-line argument. The
    # context manager closes the handle even if parsing raises.
    with open(sys.argv[1], "r") as input_file:
        reader = input_file.readlines()
        items = process_file(reader)

    # Print each header line followed by the length of its sequence.
    for key in items:
        length = len(items[key])
        print(key, length)

;