1、测试数据下载:ftp://ftp.ensemblgenomes.org/pub/plants/release-44/gff3/arabidopsis_thaliana/Arabidopsis_thaliana.TAIR10.44.chromosome.1.gff3.gz
2、解压并重命名测试数据
[root@PC1 test2]# ls
Arabidopsis_thaliana.TAIR10.44.chromosome.1.gff3.gz
[root@PC1 test2]# gunzip Arabidopsis_thaliana.TAIR10.44.chromosome.1.gff3.gz
[root@PC1 test2]# ls
Arabidopsis_thaliana.TAIR10.44.chromosome.1.gff3
[root@PC1 test2]# mv Arabidopsis_thaliana.TAIR10.44.chromosome.1.gff3 a.txt    ## 测试数据
[root@PC1 test2]# ls
a.txt
3、使用python脚本提取1号染色体上起始位置大于100000、终止位置小于500000的基因信息
[root@PC1 test2]# ls
a.txt  test.py
[root@PC1 test2]# cat test.py    ## 提取信息脚本
in_file = open("a.txt", "r")
out_file = open("result.txt", "w")
for i in in_file:
    i = i.strip()
    if i.startswith("#"):
        continue
    else:
        tmp = i.split("\t")
        if int(tmp[0]) == 1 and tmp[2] == "gene" and int(tmp[3]) > 100000 and int(tmp[4]) < 500000:
            gene = tmp[8].split(";")[0].split("=")[1]
            final = tmp[0] + "\t" + tmp[3] + "\t" + tmp[4] + "\t" + gene
            out_file.write(final + "\n")
in_file.close()
out_file.close()
[root@PC1 test2]# python test.py    ## 运行程序
[root@PC1 test2]# ls
a.txt  result.txt  test.py
[root@PC1 test2]# head result.txt    ## 查看结果
1	104440	105330	gene:AT1G01250
1	108946	111699	gene:AT1G01260
1	112263	113947	gene:AT1G01280
1	114202	116407	gene:AT1G01290
1	116784	118845	gene:AT1G01300
1	119381	119997	gene:AT1G01305
1	120154	121130	gene:AT1G01310
1	121067	130577	gene:AT1G01320
1	130736	130858	gene:AT1G01335
1	132270	135924	gene:AT1G01340