LOF(Local Outlier Factor)算法是基于密度的异常点检测算法,适合于高维数据检测。
核心思想
离群点处的密度应该较邻域内其他点的密度小。
基本概念
k距离:对于点p,将其他点与之距离进行从小到大排序,第k个即为k距离
k距离邻域:到点p的距离小于等于p的k距离的所有点(不含p本身);不存在距离并列时恰好为k个,存在并列时可能多于k个
可达距离:点o到点p的可达距离,取"点p的k距离"与"o与p之间的实际距离"中的较大者;即实际距离小于p的k距离时取p的k距离,反之取实际距离
局部可达密度:点p到其k距离邻域内各点o的可达距离(即 max(o的k距离, p与o的实际距离))的平均值的倒数。(注意方向不要搞反:这里取的是邻域点o的k距离,而不是p的k距离)
局部离群因子:邻域内各点的局部可达密度的均值除以p点的局部可达密度
局部离群因子(LOF)的大小代表该点为离群点的可信度。即因子越大,该点越可能是离群点。
from scipy.spatial.distance import cdist
import numpy as np


class LOF:
    """Local Outlier Factor (LOF) detector using a dense Euclidean distance matrix.

    Each sample is scored by the ratio between the mean local reachability
    density of its k nearest neighbours and its own local reachability
    density. Samples whose score exceeds ``epsilon`` are reported as outliers.

    Parameters
    ----------
    data : array-like, 2-D
        One sample per row. It is passed to ``scipy.spatial.distance.cdist``.
    k : int
        Number of nearest neighbours; must satisfy ``1 <= k < number of rows``.
    epsilon : float, default 1.0
        Score threshold above which a sample is flagged as an outlier.

    Raises
    ------
    ValueError
        If ``k`` is not in ``[1, number of rows)``.

    Notes
    -----
    Exactly ``k`` neighbours are kept per sample, even when several samples
    tie at the k-distance. Duplicate samples can make the summed reachability
    distance zero, in which case the density is ``inf`` and scores may be
    ``inf`` or ``nan``; this is not special-cased.
    """

    def __init__(self, data, k, epsilon=1.0):
        self.data = np.asarray(data)
        self.k = k
        self.epsilon = epsilon
        self.N = self.data.shape[0]
        # With k >= N there are fewer than k other samples, yet every average
        # below divides by k, so the scores would be silently wrong.
        if not 1 <= k < self.N:
            raise ValueError(
                "k must satisfy 1 <= k < number of samples "
                "(got k={}, samples={})".format(k, self.N)
            )

    def get_dist(self):
        """Return the pairwise Euclidean distance matrix of ``self.data``."""
        return cdist(self.data, self.data)

    def _kdist(self, arr):
        """Return ``(neighbour_indices, k_distance)`` for one distance row.

        ``arr`` holds the distances from one sample to every sample. The first
        entry of the sorted order is skipped on the assumption that it is the
        sample itself (distance 0).
        NOTE(review): with exact duplicates another sample may sort first, in
        which case the sample itself ends up among its own neighbours.
        """
        inds_sort = np.argsort(arr)
        neighbor_ind = inds_sort[1:self.k + 1]  # indices of the k nearest neighbours
        return neighbor_ind, arr[neighbor_ind[-1]]

    def get_rdist(self):
        """Compute the neighbour indices and the reachability-distance matrix.

        Returns
        -------
        nei_inds : tuple of ndarray
            ``nei_inds[i]`` holds the indices of the k nearest neighbours of
            sample ``i``.
        rdist : ndarray, shape (N, N)
            ``rdist[i, j] = max(k_distance(i), dist(i, j))``, i.e. the
            reachability distance of sample ``j`` with respect to sample ``i``.
        """
        dist = self.get_dist()
        # Iterate over rows directly: np.apply_along_axis tries to stack the
        # ragged (index array, scalar) result of _kdist into one array, which
        # raises ValueError on NumPy >= 1.24.
        nei_inds, kdist = zip(*(self._kdist(row) for row in dist))
        # Where the actual distance is below the row's k-distance, the
        # reachability distance is the k-distance; otherwise the actual one.
        rdist = np.maximum(dist, np.asarray(kdist)[:, np.newaxis])
        return nei_inds, rdist

    def get_lrd(self, nei_inds, rdist):
        """Compute the local reachability density of every sample.

        For sample ``i`` this is ``k`` divided by the sum of ``rdist[j, i]``
        over its neighbours ``j``. The direction matters: the neighbour's
        k-distance is the lower bound, not the k-distance of ``i``.
        """
        nei = np.asarray(nei_inds)
        own = np.arange(nei.shape[0])[:, np.newaxis]
        reach = rdist[nei, own]  # reach[i, m] == rdist[nei[i, m], i]
        return self.k / reach.sum(axis=1)

    def run(self):
        """Compute LOF scores and the indices of the detected outliers.

        Returns
        -------
        score : ndarray, shape (N,)
            Mean local reachability density of each sample's neighbours
            divided by the sample's own density.
        outlier_indices : ndarray
            Indices where ``score > self.epsilon``.
        """
        nei_inds, rdist = self.get_rdist()
        lrd = self.get_lrd(nei_inds, rdist)
        score = lrd[np.asarray(nei_inds)].sum(axis=1) / self.k / lrd
        return score, np.where(score > self.epsilon)[0]


if __name__ == '__main__':
    import matplotlib.pyplot as plt

    # Two Gaussian clusters centred at (2, 2) and (-2, -2), plus uniform noise.
    np.random.seed(42)
    X_inliers = 0.3 * np.random.randn(100, 2)
    X_inliers = np.r_[X_inliers + 2, X_inliers - 2]
    X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
    data = np.r_[X_inliers, X_outliers]

    lof = LOF(data, 5, epsilon=1.2)
    score, out_ind = lof.run()
    outliers = data[out_ind]

    # All samples in blue, detected outliers drawn over them in red.
    plt.scatter(data[:, 0], data[:, 1], color='b')
    plt.scatter(outliers[:, 0], outliers[:, 1], color='r')
    plt.show()