DGA域名是指僵尸网络通过域名生成算法(Domain Generation Algorithm)生成的随机性较高的域名。攻击者常用此类域名构建自己的恶意软件基础设施,以绕过安全产品的黑名单、规避安全设备的拦截,从而建立C2连接或通过DNS通道传输数据。
本小节使用alexa前1000域名(679个样本:label标记为0)作为白样本,使用dga-cryptolocker(1000个样本:label标记为1)和dga-post-tovar-goz(1000个样本:label标记为2)作为黑样本。
def load_alexa(filename, min_len=None):
    """Load benign domains from an Alexa-style CSV file.

    The domain is taken from the second column of each row. Rows with
    fewer than two columns (for example blank lines) are skipped.

    Args:
        filename: Path to the CSV file.
        min_len: Minimum domain length to keep. Defaults to the
            module-level ``MIN_LEN``.

    Returns:
        A list of the domains whose length is at least ``min_len``.
    """
    if min_len is None:
        min_len = MIN_LEN
    # The context manager closes the handle; ``newline=""`` is the mode
    # the csv module asks for.
    with open(filename, newline="") as f:
        return [
            row[1]
            for row in csv.reader(f)
            if len(row) > 1 and len(row[1]) >= min_len
        ]


def load_dga(filename, min_len=None):
    """Load DGA domains from a comma-separated text file.

    The domain is the text before the first comma on each line.

    Args:
        filename: Path to the text file.
        min_len: Minimum domain length to keep. Defaults to the
            module-level ``MIN_LEN``.

    Returns:
        A list of the domains whose length is at least ``min_len``.
    """
    if min_len is None:
        min_len = MIN_LEN
    domain_list = []
    with open(filename) as f:
        for line in f:
            # strip() removes the trailing newline that is otherwise kept
            # when a line contains no comma.
            domain = line.split(",")[0].strip()
            if len(domain) >= min_len:
                domain_list.append(domain)
    return domain_list


def nb_dga():
    """Build the sample and label arrays for the three domain sets.

    Labels: 0 for the Alexa domains, 1 for cryptolocker, 2 for
    post-tovar-goz. This is an excerpt; the rest of the function body
    (vectorization, clustering, plotting) is shown in the snippets that
    follow in the article.
    """
    x1_domain_list = load_alexa("../data/top-1000.csv")
    x2_domain_list = load_dga("../data/dga-cryptolocke-1000.txt")
    x3_domain_list = load_dga("../data/dga-post-tovar-goz-1000.txt")
    x_domain_list = np.concatenate(
        (x1_domain_list, x2_domain_list, x3_domain_list))
    y1 = [0] * len(x1_domain_list)
    y2 = [1] * len(x2_domain_list)
    y3 = [2] * len(x3_domain_list)
    y = np.concatenate((y1, y2, y3))
本小节DGA域名使用2-gram分割域名,切割单元为字符(r='\w')并映射为向量,具体代码如下:
# Every word character is its own token (token_pattern=r"\w") and features
# are counts of two consecutive tokens (ngram_range=(2, 2)).
vectorizer = CountVectorizer(
    ngram_range=(2, 2),
    decode_error="ignore",
    token_pattern=r"\w",
    min_df=1,
)
bigram_counts = vectorizer.fit_transform(x_domain_list)
# Convert the sparse count matrix to a dense array.
x = bigram_counts.toarray()
# Split the feature vectors into two clusters; the seed is fixed through
# the module-level random_state.
clusterer = KMeans(n_clusters=2, random_state=random_state)
y_pred = clusterer.fit_predict(x)
使用TSNE将高维向量降维后进行可视化:聚类标签(y_pred)为1的样本用'o'标记,其余样本用'x'标记(本次运行中DGA域名对应'x')。
# Project the feature vectors to 2-D for plotting. The seed is fixed so the
# figure is reproducible between runs.
tsne = TSNE(learning_rate=100, random_state=random_state)
x = tsne.fit_transform(x)

# One scatter call per cluster instead of one per point: each call takes the
# next colour of the cycle, so every cluster now gets a single colour.
in_cluster_one = y_pred == 1
plt.scatter(x[in_cluster_one, 0], x[in_cluster_one, 1], marker='o')
plt.scatter(x[~in_cluster_one, 0], x[~in_cluster_one, 1], marker='x')
plt.show()
相比原作者提供的源码,新增了计算准确率的部分
# -*- coding:utf-8 -*-
import csv

import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE

# Minimum length a domain must have to be kept as a sample.
MIN_LEN = 10
# Seed shared by KMeans and TSNE so that runs are reproducible.
random_state = 170


def load_alexa(filename, min_len=None):
    """Load benign domains from an Alexa-style CSV file.

    The domain is taken from the second column of each row. Rows with
    fewer than two columns (for example blank lines) are skipped.

    Args:
        filename: Path to the CSV file.
        min_len: Minimum domain length to keep. Defaults to ``MIN_LEN``.

    Returns:
        A list of the domains whose length is at least ``min_len``.
    """
    if min_len is None:
        min_len = MIN_LEN
    # The context manager closes the handle; ``newline=""`` is the mode
    # the csv module asks for.
    with open(filename, newline="") as f:
        return [
            row[1]
            for row in csv.reader(f)
            if len(row) > 1 and len(row[1]) >= min_len
        ]


def load_dga(filename, min_len=None):
    """Load DGA domains from a comma-separated text file.

    The domain is the text before the first comma on each line, e.g.
    ``xsxqeadsbgvpdke.co.uk,Domain used by Cryptolocker - Flashback DGA
    for 13 Apr 2017,2017-04-13,http://osint.bambenekconsulting.com/manual/cl.txt``.

    Args:
        filename: Path to the text file.
        min_len: Minimum domain length to keep. Defaults to ``MIN_LEN``.

    Returns:
        A list of the domains whose length is at least ``min_len``.
    """
    if min_len is None:
        min_len = MIN_LEN
    domain_list = []
    with open(filename) as f:
        for line in f:
            # strip() removes the trailing newline that is otherwise kept
            # when a line contains no comma.
            domain = line.split(",")[0].strip()
            if len(domain) >= min_len:
                domain_list.append(domain)
    return domain_list


def cluster_accuracy(y_true, y_pred):
    """Return the accuracy (in percent) of a two-cluster assignment.

    KMeans numbers its clusters arbitrarily, so cluster 0 may correspond
    to either class. The score is therefore the better of the two possible
    cluster-to-class mappings.

    Args:
        y_true: Ground-truth labels, each 0 or 1.
        y_pred: Cluster ids, each 0 or 1.

    Returns:
        A float between 50.0 and 100.0, or 0.0 for empty input.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.size == 0:
        return 0.0
    match = float(np.mean(y_pred == y_true) * 100)
    return max(match, 100 - match)


def _plot_clusters(features, y_pred):
    """Show a 2-D TSNE projection with one marker style per cluster."""
    points = TSNE(learning_rate=100,
                  random_state=random_state).fit_transform(features)
    # One scatter call per cluster so each cluster gets a single colour.
    in_cluster_one = y_pred == 1
    plt.scatter(points[in_cluster_one, 0], points[in_cluster_one, 1],
                marker='o')
    plt.scatter(points[~in_cluster_one, 0], points[~in_cluster_one, 1],
                marker='x')
    plt.show()


def kmeans_dga():
    """Cluster benign vs. DGA domains with KMeans, print accuracy and plot.

    Alexa domains are labelled 0; both DGA families are labelled 1.
    """
    x1_domain_list = load_alexa("../data/dga/top-100.csv")
    x2_domain_list = load_dga("../data/dga/dga-cryptolocke-50.txt")
    x3_domain_list = load_dga("../data/dga/dga-post-tovar-goz-50.txt")

    x_domain_list = x1_domain_list + x2_domain_list + x3_domain_list
    y = np.array(
        [0] * len(x1_domain_list)
        + [1] * (len(x2_domain_list) + len(x3_domain_list))
    )

    # Every word character is a token; features are counts of two
    # consecutive tokens.
    cv = CountVectorizer(ngram_range=(2, 2), decode_error="ignore",
                         token_pattern=r"\w", min_df=1)
    x = cv.fit_transform(x_domain_list).toarray()

    model = KMeans(n_clusters=2, random_state=random_state)
    y_pred = model.fit_predict(x)

    print(cluster_accuracy(y, y_pred))
    _plot_clusters(x, y_pred)


if __name__ == '__main__':
    kmeans_dga()
72.15189873417721
可视化如下
看起来效果不怎么地啊
测试仅区分正常数据与cryptolocker家族的DGA域名,代码修改如下
def cluster_accuracy(y_true, y_pred):
    """Return the accuracy (in percent) of a two-cluster assignment.

    KMeans numbers its clusters arbitrarily, so cluster 0 may correspond
    to either class. The score is therefore the better of the two possible
    cluster-to-class mappings.

    Args:
        y_true: Ground-truth labels, each 0 or 1.
        y_pred: Cluster ids, each 0 or 1.

    Returns:
        A float between 50.0 and 100.0, or 0.0 for empty input.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.size == 0:
        return 0.0
    match = float(np.mean(y_pred == y_true) * 100)
    return max(match, 100 - match)


def kmeans_dga():
    """Cluster Alexa vs. cryptolocker domains, print accuracy and plot.

    Alexa domains are labelled 0 and cryptolocker domains 1. The
    post-tovar-goz set is not used in this variant, so it is not loaded.
    """
    x1_domain_list = load_alexa("../data/dga/top-100.csv")
    x2_domain_list = load_dga("../data/dga/dga-cryptolocke-50.txt")

    x_domain_list = x1_domain_list + x2_domain_list
    y = np.array([0] * len(x1_domain_list) + [1] * len(x2_domain_list))

    # Every word character is a token; features are counts of two
    # consecutive tokens.
    cv = CountVectorizer(ngram_range=(2, 2), decode_error="ignore",
                         token_pattern=r"\w", min_df=1)
    x = cv.fit_transform(x_domain_list).toarray()

    model = KMeans(n_clusters=2, random_state=random_state)
    y_pred = model.fit_predict(x)

    print(cluster_accuracy(y, y_pred))

    # Project to 2-D with a fixed seed so the figure is reproducible.
    tsne = TSNE(learning_rate=100, random_state=random_state)
    x = tsne.fit_transform(x)

    # One scatter call per cluster so each cluster gets a single colour.
    in_cluster_one = y_pred == 1
    plt.scatter(x[in_cluster_one, 0], x[in_cluster_one, 1], marker='o')
    plt.scatter(x[~in_cluster_one, 0], x[~in_cluster_one, 1], marker='x')
    plt.show()
测试结果如下所示,看起来也没有好到哪里去
82.4074074074074
可视化
将代码改为可配置组合,源码如下
# -*- coding:utf-8 -*-
import csv

import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE

# Minimum length a domain must have to be kept as a sample.
MIN_LEN = 10
# Seed shared by KMeans and TSNE so that runs are reproducible.
random_state = 170


def load_alexa(filename, min_len=None):
    """Load benign domains from an Alexa-style CSV file.

    The domain is taken from the second column of each row. Rows with
    fewer than two columns (for example blank lines) are skipped.

    Args:
        filename: Path to the CSV file.
        min_len: Minimum domain length to keep. Defaults to ``MIN_LEN``.

    Returns:
        A list of the domains whose length is at least ``min_len``.
    """
    if min_len is None:
        min_len = MIN_LEN
    # The context manager closes the handle; ``newline=""`` is the mode
    # the csv module asks for.
    with open(filename, newline="") as f:
        return [
            row[1]
            for row in csv.reader(f)
            if len(row) > 1 and len(row[1]) >= min_len
        ]


def load_dga(filename, min_len=None):
    """Load DGA domains from a comma-separated text file.

    The domain is the text before the first comma on each line, e.g.
    ``xsxqeadsbgvpdke.co.uk,Domain used by Cryptolocker - Flashback DGA
    for 13 Apr 2017,2017-04-13,http://osint.bambenekconsulting.com/manual/cl.txt``.

    Args:
        filename: Path to the text file.
        min_len: Minimum domain length to keep. Defaults to ``MIN_LEN``.

    Returns:
        A list of the domains whose length is at least ``min_len``.
    """
    if min_len is None:
        min_len = MIN_LEN
    domain_list = []
    with open(filename) as f:
        for line in f:
            # strip() removes the trailing newline that is otherwise kept
            # when a line contains no comma.
            domain = line.split(",")[0].strip()
            if len(domain) >= min_len:
                domain_list.append(domain)
    return domain_list


def cluster_accuracy(y_true, y_pred):
    """Return the accuracy (in percent) of a two-cluster assignment.

    KMeans numbers its clusters arbitrarily, so cluster 0 may correspond
    to either class. The score is therefore the better of the two possible
    cluster-to-class mappings.

    Args:
        y_true: Ground-truth labels, each 0 or 1.
        y_pred: Cluster ids, each 0 or 1.

    Returns:
        A float between 50.0 and 100.0, or 0.0 for empty input.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.size == 0:
        return 0.0
    match = float(np.mean(y_pred == y_true) * 100)
    return max(match, 100 - match)


def _plot_clusters(features, y_pred):
    """Show a 2-D TSNE projection with one marker style per cluster."""
    points = TSNE(learning_rate=100,
                  random_state=random_state).fit_transform(features)
    # One scatter call per cluster so each cluster gets a single colour.
    in_cluster_one = y_pred == 1
    plt.scatter(points[in_cluster_one, 0], points[in_cluster_one, 1],
                marker='o')
    plt.scatter(points[~in_cluster_one, 0], points[~in_cluster_one, 1],
                marker='x')
    plt.show()


def kmeans_dga(domain_x=123, pic_show=False):
    """Cluster a chosen combination of domain sets and print the accuracy.

    The sets are numbered 1 (Alexa), 2 (cryptolocker) and 3
    (post-tovar-goz).

    Args:
        domain_x: Which sets to combine.
            12: Alexa (label 0) vs. cryptolocker (label 1).
            13: Alexa (label 0) vs. post-tovar-goz (label 1).
            23: cryptolocker (label 0) vs. post-tovar-goz (label 1).
            Any other value, including the default 123: Alexa (label 0)
            vs. both DGA families (label 1).
        pic_show: When true, also show a TSNE scatter plot of the clusters.
    """
    alexa = load_alexa("../data/dga/top-100.csv")
    cryptolocker = load_dga("../data/dga/dga-cryptolocke-50.txt")
    tovar = load_dga("../data/dga/dga-post-tovar-goz-50.txt")

    # Each entry is a sequence of (domains, label) groups.
    combinations = {
        12: ((alexa, 0), (cryptolocker, 1)),
        13: ((alexa, 0), (tovar, 1)),
        23: ((cryptolocker, 0), (tovar, 1)),
    }
    all_sets = ((alexa, 0), (cryptolocker, 1), (tovar, 1))
    groups = combinations.get(domain_x, all_sets)

    x_domain_list = [d for domains, _ in groups for d in domains]
    y = np.array([label for domains, label in groups for _ in domains])

    # Every word character is a token; features are counts of two
    # consecutive tokens.
    cv = CountVectorizer(ngram_range=(2, 2), decode_error="ignore",
                         token_pattern=r"\w", min_df=1)
    x = cv.fit_transform(x_domain_list).toarray()

    model = KMeans(n_clusters=2, random_state=random_state)
    y_pred = model.fit_predict(x)

    score = cluster_accuracy(y, y_pred)
    print(domain_x, score)

    if pic_show:
        _plot_clusters(x, y_pred)


if __name__ == '__main__':
    kmeans_dga(domain_x=123)
    kmeans_dga(domain_x=12)
    kmeans_dga(domain_x=13)
    kmeans_dga(domain_x=23)
输出结果如下
123 72.15189873417721 12 82.4074074074074 13 33.33333333333333 23 60.0
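结果都不怎么样。需要注意的是,KMeans输出的簇编号(y_pred中的0或1)是任意的,与真实标签y的取值没有固定的对应关系,因此直接用np.mean(y_pred == y)得到的数值可能低于50%——例如上面组合13的33.33%,把簇编号对调后相当于66.67%。不过即便只是把y_pred分成0或者1两类,怎么看效果都不怎么样。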