以下代码为个人原创,python实现,是处理PDB文件的部分常用代码,仅供参考!
1.下载PDB文件 下面是一个下载PDB文件的函数,传入的参数是一个写有pdb名字的namefile文件,函数的核心部分是三个系统命令,先通过wget下载,然后解压,最后替换名字。
1 2 3 4 5 6 7 def downloadpdb (namefile) : inputfile = open(namefile, 'r' ) for eachline in inputfile: pdbname = eachline.lower().strip() os.system("wget http://ftp.wwpdb.org/pub/pdb/data/structures/all/pdb/pdb" + pdbname + ".ent.gz" ) os.system("gzip -d pdb" + pdbname + '.ent.gz' ) os.system("mv pdb" + pdbname + ".ent " + pdbname.upper() + '.pdb' )
测试用例
1 2 os.chdir('/ifs/home/liudiwei/datasets/RPdatas' ) downloadpdb('protein.name' )
2.PDB转DSSP 将下载的PDB文件转成DSSP文件
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 def formatdsspline (dsspline) : eachline = dsspline col = '\t' + eachline[0 :5 ] col += '\t' + eachline[5 :10 ] col += '\t' + eachline[10 :12 ] col += '\t' + eachline[12 :15 ] col += '\t' + eachline[15 :25 ] col += '\t' + eachline[25 :39 ] col += '\t' + eachline[29 :34 ] col += '\t' + eachline[34 :38 ] col += '\t' + eachline[38 :50 ] col += '\t' + eachline[50 :61 ] col += '\t' + eachline[61 :72 ] col += '\t' + eachline[72 :83 ] col += '\t' + eachline[83 :92 ] col += '\t' + eachline[92 :97 ] col += '\t' + eachline[97 :103 ] col += '\t' + eachline[103 :109 ] col += '\t' + eachline[109 :115 ] col += '\t' + eachline[115 :122 ] col += '\t' + eachline[122 :129 ] col += '\t' + eachline[129 :136 ] return col
PDB转DSSP格式,需要DSSP软件
参数:
pdbdir: pdb文件目录
dsspdir: 生成的dssp文件目录(需创建)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 def pdbToDSSP (pdbnamefile,pdbdir, dsspdir) : pdbfiles = os.listdir(pdbdir) for pdb_file in pdbfiles: pdb_name = pdb_file.split('.' )[0 ].upper() command = 'DSSPCMBI.EXE -x ' + pdbdir +'/' + pdb_file + ' ' + dsspdir +"/" + pdb_name +'.dssp' os.system(command) dsspfiles = os.listdir(dsspdir) if os.path.exists(dsspdir + "/DSSP" ): dsspfiles.remove("DSSP" ) output=open(dsspdir + '/DSSP' ,'w' ) with open(pdbnamefile, 'r' ) as namefile: for eachline in namefile: pdb_name = eachline.strip() dssp_file = pdb_name + '.dssp' with open(dsspdir + '/' + dssp_file,"r" ) as f: if not os.path.isdir(dsspdir + '/format' ): os.mkdir(dsspdir + '/format' ) with open(dsspdir + '/format/' + pdb_name + '.dssp.format' ,'w' ) as singleOut: count = 0 ; preRes=[] sets = set('' );content='' for eachline in f.readlines(): list1=[];oneline=[] count+=1 list1.append(pdb_name) if count >= 29 : eachline = formatdsspline(eachline) oneline = eachline.split('\t' ) if oneline[3 ].strip(): preRes = oneline[3 ].strip() list1.append(eachline) content += "" .join(list1)+'\n' if '!' == oneline[4 ].strip(): continue if '!*' in eachline or not oneline[3 ].strip(): if preRes in sets and len(sets): content='' preRes=[] continue sets.add(preRes) output.write(content) singleOut.write(content) content='' if preRes and preRes not in sets: output.write(content) singleOut.write(content) output.close()
测试
1 2 3 4 5 pdbdir = 'z:/datasets/protein/pdb' dsspdir = 'Z:/datasets/protein/DSSPdir' proname = 'Z:/datasets/protein/protein.name' pdbToDSSP(proname,pdbdir, dsspdir)
3.DSSP抽取序列 从一个整合的DSSP文件中抽取序列文件
从格式化后的dssp文件获取序列信息
参数:dsspfile为格式过的DSSP文件,seqfile为输出的序列文件,同时输出序列文件
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 def getSeqFromDSSP (dsspfile, seqfile, minLen) : with open(dsspfile, 'r' ) as inputfile: if not seqfile.strip(): seqfile = 'protein' +minLen + '.dssp.seq' outchain = open('protein40.chain.all' , 'w' ) with open(seqfile, 'w' ) as outputfile: residue=[];Ntype=[] preType=[];preRes=[] firstline=[];secondline=[];content='' for eachline in inputfile: oneline = eachline.split('\t' ) residue = oneline[0 ] if not residue.strip(): continue Ntype = oneline[3 ].strip() if not Ntype.strip(): continue if preRes!=residue: content = '' .join(firstline)+'\n' + '' .join(secondline) +'\n' if len(secondline) >=minLen and not 'X' in secondline: outchain.write('' .join(firstline) + '\n' ) outputfile.write(content) firstline=[] firstline.append('>' + residue + ':' + Ntype) secondline=[];secondline.append(oneline[4 ].strip()) preRes = residue;preType = Ntype continue if Ntype != preType: content = '' .join(firstline)+'\n' + '' .join(secondline) +'\n' if len(secondline) >=minLen and not 'X' in secondline: outchain.write('' .join(firstline) + '\n' ) outputfile.write(content) firstline=[] firstline.append('>' + residue + ':' + Ntype) secondline=[];secondline.append(oneline[4 ].strip()) preRes = residue;preType = Ntype else : secondline.append(oneline[4 ].strip()) content = '' .join(firstline)+'\n' + '' .join(secondline) +'\n' if len(secondline) >=minLen and not 'X' in secondline: outchain.write('' .join(firstline) + '\n' ) outputfile.write(content) outchain.close()
测试:
1 2 3 4 os.chdir(r"E:\3-CSU\Academic\_Oriented\analysis\experiment\Datasets\ptest.dssp" ) pdbfile = 'DSSP' outfile = 'protein.seq' getSeqFromDSSP(pdbfile,outfile)
4.对序列做blast聚类 设置相应的参数,在服务器上跑blast,代码如下:
1 ifs/share/lib/cd-hit-v4.5 .4 /cd-hit -i /ifs/home/liudiwei/datasets/protein40.seq -o /ifs/home/liudiwei/experiment/cdhit/fasta.40 -c 0.4 -n 2 -M 2000
5.生成聚类后的DSSP,得到protein.name、protein.seq、protein.chain三个文件 从原来的DSSP文件中,根据聚类后的链名抽取新的DSSP文件
1 2 3 4 5 6 7 8 9 10 11 12 13 14 def extractDSSP (dsspfile, chainname, outfile) : with open(outfile, 'w' ) as outdssp: with open(dsspfile, 'r' ) as inputdssp: for eachline in inputdssp: oneline = eachline.split('\t' ) with open(chainname,'r' ) as chainfile: for eachchain in chainfile: protein_ame = eachchain[1 :5 ] chain_id = eachchain[6 :7 ] if oneline[0 ].strip() == protein_ame and oneline[3 ].strip() == chain_id: outdssp.write(eachline) break
测试实例:
1 2 3 4 dsspfile = '/ifs/home/liudiwei/datasets/832.protein/DSSPdir1/DSSP' chainname = '/ifs/home/liudiwei/experiment/step1/832p.cluster/cdhit/protein.chain' outfile = '/ifs/home/liudiwei/experiment/step1/832p.cluster/cdhit/DSSP' extractDSSP(dsspfile, chainname, outfile )