在入侵检测业务中,针对文件内容型的告警详情,需要展示命中内容及其上下若干行,并将命中内容高亮。
先来看一种实现:
/** * Created by zhangli on 19-12-18. * 高亮文本工具类 */ public class HighLightUtils { private static final Integer LINE_NUM = 10; private static final int MAX_REGEX_NUM = 10; /** * @param content 文本内容 * @param keywords 关键字列表 * @return 高亮内容段落集 */ public static List<MatchedContent> highlight(String content, List<String> keywords) { if (StringUtils.isEmpty(content) || CollectionUtils.isEmpty(keywords)) { return Collections.emptyList(); } List<MatchedContent> partContentList = Lists.newArrayList(); for (String keyword : keywords) { if (!content.contains(keyword)) { continue; } partContentList.addAll(highlight(content, escapeRegexSpecialWord(keyword))); } return partContentList; } /** * @param content 文本内容 * @param regex 正则表达式 * @return 高亮内容段落集 */ public static List<MatchedContent> highlight(String content, String regex) { return highlight(content, regex, MAX_REGEX_NUM, LINE_NUM); } public static List<MatchedContent> highlight(String content, String regex, int maxMatchNum, int lineNum) { if (StringUtils.isEmpty(content) || StringUtils.isEmpty(regex)) { return Collections.emptyList(); } content = content.replaceAll("\\r\\n", "\n"); Pattern pattern = Pattern.compile(regex); Matcher m = pattern.matcher(content); List<MatchedContent> partContentList = Lists.newArrayList(); int maxNum = maxMatchNum; while (m.find()) { RegexMatchPoint regexMatchPoint = new RegexMatchPoint(m.start(), m.end()); partContentList.add(getPartContentMap(content, regexMatchPoint, lineNum)); if (--maxNum == 0) { break; } } return partContentList; } /** * 根据正则匹配获取高亮内容及起始行 */ private static MatchedContent getPartContentMap(String content, RegexMatchPoint m, int lineNum) { // 获取匹配内容在文件中的行数 int startMatchLine = content.substring(0, m.getStart()).split("\\n").length; int endMatchLine = content.substring(0, m.getEnd()).split("\\n").length; // 高亮文件匹配内容 String highlightContent = highlightOneRegexContent(content, m); // 截取匹配内容前后共20行(若匹配内容跨行,且大于10行,则从匹配的地方开始截取) String partContent = getPartContent(highlightContent, 
startMatchLine, endMatchLine); // 获取截取内容首行的行号 int startLine = endMatchLine - lineNum + 1; //如果匹配的内容大于10行,则从最初匹配行开始,而不是固定的10行 if (startMatchLine < startLine) { startLine = startMatchLine; } return MatchedContent.builder() .startLine(startLine < 1 ? 1 : startLine) .partContent(partContent) .build(); } /** * 获取高亮行前后部分内容 */ private static String getPartContent(String content, Integer startMatchLine, Integer endMatchLine) { int start = StringUtils.ordinalIndexOf(content, "\n", endMatchLine - LINE_NUM); if (endMatchLine - startMatchLine > LINE_NUM) { start = StringUtils.ordinalIndexOf(content, "\n", startMatchLine - 1); } start = start < 0 ? 0 : start + 1; int end = StringUtils.ordinalIndexOf(content, "\n", endMatchLine + LINE_NUM); end = end < 0 ? content.length() : end; return content.substring(start, end); } /** * 高亮单个匹配的内容 */ private static String highlightOneRegexContent(String content, RegexMatchPoint point) { int start = 0; StringBuffer highlightContentSb = new StringBuffer(); highlightContentSb.append(content.substring(start, point.getStart())).append(CommonValues.HIGH_LIGHT_START) .append(content.substring(point.getStart(), point.getEnd())).append(CommonValues.HIGH_LIGHT_END) .append(content.substring(point.getEnd())); return highlightContentSb.toString(); } private static String escapeRegexSpecialWord(String keyword) { if (keyword != "") { String[] fbsArr = { "\\", "$", "(", ")", "*", "+", ".", "[", "]", "?", "^", "{", "}", "|" }; for (String key : fbsArr) { if (keyword.contains(key)) { keyword = keyword.replace(key, "\\" + key); } } } return keyword; } @Setter @Getter @ToString public static class RegexMatchPoint implements Comparable<RegexMatchPoint> { private Integer start; private Integer end; public RegexMatchPoint(Integer start, Integer end) { this.start = start; this.end = end; } //按开始位置排序 @Override public int compareTo(RegexMatchPoint o) { if (start.compareTo(o.getStart()) == 0) { return end.compareTo(o.getEnd()); } else { return 
start.compareTo(o.getStart()); } } public RegexMatchPoint copy() { return new RegexMatchPoint(start, end); } } }
这个实现还是不错的,至少给人很好的启发,是一个很好的改进基础。
那么,它的问题在哪里呢? 问题在于它没有做到“构造与使用分离”:每找到一个正则匹配,就立刻去计算行号、高亮并截取上下文,匹配信息的构建与展示内容的生成耦合在了一起。
何为构造与使用分离呢? 是指构造的时候,就提取出足够的必要信息; 而在使用时则运用这些信息去处理,而不是“边构建边使用”。就像编译器做代码编译和自动生成一样,应该不会是边编译边生成代码。
边构建边使用的实现,会将构建与处理耦合在一起,一旦有需要改动,就会比较困难。
很显然,如果要构造与使用分离,那么我们需要首先拿到什么内容? (命中内容的行号、起始位置、结束位置;所有文件行及行号) 应该先把这些必要信息提取出来。一旦我们确定了求解问题需要的必要信息,想出一个清晰的算法就比较自然了。
步骤一:获取所有行及行号【行号,行内容】;
步骤二: 先找到所有匹配正则的字符串的行号及起始结束点 regexMatchPoints =(lineNo, start, end);
步骤三:将 regexMatchPoints 按行号分组;因为同一行内可能有多个匹配,若逐个高亮后再合并会很麻烦,按行分组后可以一次完成该行的高亮;
步骤四:将所有匹配行号升序排序,方便最终按行号顺序展示;
步骤五:按行生成高亮内容展示[行号,高亮行内容];
步骤六:按匹配行号计算需要展示的起始行号和结束行号;如果某个匹配行号已经落在此前算出的区间内,则可以跳过(以此实现合并);
步骤七:根据所有起始行号和结束行号获取对应的行内容。
/** * 高亮文本展示工具类 * Created by qinshu on 2021/12/31 */ public class HighLightUtil { private static final Logger LOG = LogUtils.getLogger(HighLightUtil.class); /** 高亮展示前后的行数 */ private static final Integer HIGHLIGHT_LINE_NUM = 5; /** 最大匹配多少次 */ private static final int MAX_REGEX_NUM = 10; /** * @param content 文本内容 * @param regex 正则表达式 * @return 高亮内容段落集 */ public static List<MatchedFileContent> highlight(String content, String regex) { return highlight(content, regex, MAX_REGEX_NUM, HIGHLIGHT_LINE_NUM); } /** * @param base64Content 文本内容(base64编码后的文本) * @param regex 正则表达式 * @return 高亮内容段落集 */ public static List<MatchedFileContent> highlightBase64(String base64Content, String regex) { if (StringUtils.isEmpty(base64Content)) { return Collections.emptyList(); } return highlight(Base64Utils.decodeContent(base64Content), regex); } public static List<MatchedFileContent> highlight(String content, String regex, int maxMatchNum, int highlightLineNum) { if (StringUtils.isEmpty(content) || StringUtils.isEmpty(regex)) { return Collections.emptyList(); } content = content.replaceAll("\\r\\n", "\n"); List<String> allLines = Arrays.asList(content.split("\n")); Pattern pattern = Pattern.compile(regex); List<RegexMatchPoint> regexMatchPoints = findAllRegexMatches(allLines, pattern); // 按行号分组,匹配高亮展示,因为单行多个匹配的高亮需要单行展示,分开后合并比较麻烦 Map<Integer, List<RegexMatchPoint>> regexMatchPointMap = regexMatchPoints.stream().collect(Collectors.groupingBy(RegexMatchPoint::getLineNo)); // highLightLineMap: [行号,高亮行] Map<Integer, String> highLightLineMap = new HashMap<>(); regexMatchPointMap.forEach((lineNo, matchPointsOfLine) -> { highLightLineMap.put(lineNo, highlightOneLineContent(allLines.get(lineNo), matchPointsOfLine)); } ); List<MatchedFileContent> partContentList = merge(highLightLineMap, allLines, highlightLineNum); return partContentList.subList(0, Math.min(partContentList.size(), maxMatchNum)); } private static List<MatchedFileContent> merge(Map<Integer, String> highLightLineMap, List<String> 
allLines, int highlightLineNum) { // 按行号排序 List<Integer> highLightLineNos = Lists.newArrayList(highLightLineMap.keySet()); Collections.sort(highLightLineNos); // 计算需要展示的行号 List<MatchedFileLine> matchedFileLines = Lists.newArrayList(); for (Integer highLineNo: highLightLineNos) { if (!exist(matchedFileLines, highLineNo)) { int startLine = highLineNo - highlightLineNum; int endLine = 0; if (startLine < 0) { startLine = 0; endLine = highLineNo + highlightLineNum; } else { startLine = highLineNo - highlightLineNum + 1; endLine = highLineNo + highlightLineNum; } matchedFileLines.add(new MatchedFileLine(startLine, endLine)); } } return matchedFileLines.stream() .map(fileLine -> getMatchedFileContent(highLightLineMap, allLines, fileLine)).collect(Collectors.toList()); } /** * 获取指定行号的行内容 */ private static String getLine(Map<Integer, String> highLightLineMap, List<String> allLines, Integer lineNo) { String highLightLine = highLightLineMap.get(lineNo); return highLightLine != null ? highLightLine : allLines.get(lineNo); } private static boolean exist(List<MatchedFileLine> matchedFileLines, Integer lineNo) { return matchedFileLines.stream().anyMatch(fileLine -> exist(fileLine, lineNo)); } private static boolean exist(MatchedFileLine matchedFileLine, Integer lineNo) { return lineNo >= matchedFileLine.getStartLine() && lineNo < matchedFileLine.getEndLine(); } /** * 根据起始行号获取 * @param highLightLineMap 高亮行 * @param allLines 文件所有行 * @param fileLine 匹配内容上下文行号 * @return 匹配内容上下文及起始行号 */ private static MatchedFileContent getMatchedFileContent(Map<Integer, String> highLightLineMap, List<String> allLines, MatchedFileLine fileLine) { StringBuilder partContentBuilder = new StringBuilder(); for (int i = fileLine.getStartLine(); i < fileLine.getEndLine() && i < allLines.size(); i++) { partContentBuilder.append(getLine(highLightLineMap, allLines, i) + "\n"); } return new MatchedFileContent(fileLine.getStartLine() + 1, partContentBuilder.toString()); } /** * 获取所有正则匹配点 * @param allLines 
文件内容的所有行 * @param pattern 正则匹配编译表达式 * @return 所有匹配正则表达式的字符串的位置 */ private static List<RegexMatchPoint> findAllRegexMatches(List<String> allLines, Pattern pattern) { // 先拿到所有的正则匹配点,行号从 0 开始 List<RegexMatchPoint> regexMatchPoints = Lists.newArrayList(); for (int i=0; i < allLines.size(); i++) { String line = allLines.get(i); Matcher m = pattern.matcher(line); while (m.find()) { RegexMatchPoint regexMatchPoint = new RegexMatchPoint(i, m.start(), m.end()); regexMatchPoints.add(regexMatchPoint); } } return regexMatchPoints; } /** * 高亮文本内容 */ public static String highlightContent(String content, List<String> match) { if (CollectionUtils.isEmpty(match)) { return content; } try { for (String matchContent : match) { String highlightContent = String.format("%s%s%s", CommonValues.HIGH_LIGHT_START, matchContent, CommonValues.HIGH_LIGHT_END); content = content.replaceAll(ExprUtils.escapeExprSpecialWord(matchContent), highlightContent); } } catch (Exception e) { LOG.error("highlight content error, content:{}, match:{}", content, match); } return content; } /** * 高亮一行的展示 */ public static String highlightOneLineContent(String content, List<RegexMatchPoint> points) { int start = 0; int lastMatchEnd = 0; StringBuilder sb = new StringBuilder(); for (RegexMatchPoint point: points) { sb.append(content, start, point.getStart()).append(CommonValues.HIGH_LIGHT_START) .append(content, point.getStart(), point.getEnd()).append(CommonValues.HIGH_LIGHT_END); start = point.getEnd(); lastMatchEnd = point.getEnd(); } sb.append(content.substring(lastMatchEnd)); return sb.toString(); } @Setter @Getter @ToString public static class RegexMatchPoint implements Comparable<RegexMatchPoint> { private Integer lineNo; private Integer start; private Integer end; public RegexMatchPoint(Integer lineNo, Integer start, Integer end) { this.lineNo = lineNo; this.start = start; this.end = end; } public RegexMatchPoint copy() { return new RegexMatchPoint(lineNo, start, end); } } @Setter @Getter public static class 
MatchedFileLine { private Integer startLine; private Integer endLine; public MatchedFileLine(Integer startLine, Integer endLine) { this.startLine = startLine; this.endLine = endLine; } } }
/** * 高亮展示 * Created by qinshu on 2021/12/31 */ public class HighlightUtilTest { String content = "dependencies {\n" + " testCompile group: 'junit', name: 'junit'\n" + "\n" + " compile project(\":detect-lib\")\n" + " compile project(\":connect-cli\")\n" + " compile project(\":wisteria-client\")\n" + " compile project(\":upload-cli\")\n" + " compile project(\":scan-client\")\n" + " compile(\"com.qt.qt-common:config-loader\")\n" + " compile project(\":switches-lib\")\n" + " compile project(\":bizevent-lib\")\n" + " compile project(\":user-client\")\n" + " compile project(\":notif-client\")\n" + " compile project(\":detect-client\")\n" + " compile project(\":job-cli\")\n" + " compile('com.qt.qt-common:redis-lib')\n" + " compile('com.qt.qt-common:rabbitmq-lib')\n" + " compile('com.qt.qt-common:encrypt-property-lib')\n" + " compile project(\":leader-latch-lib\")\n" + " compile(\"com.qt.qt-common:eventflow-lib:1.0.0-SNAPSHOT\")\n" + " compile(\"com.qt.qt-common:intrusion-detect-lib:1.0.1\")\n" + " compile('com.qt.qt-common:mysql-lib')\n" + " compile('com.qt.qt-common:rule-crypto')\n" + " compile project(\":rule-lib\")\n" + " compile project(\":api-auth-lib\")\n" + "\n" + " // Spring Cloud\n" + " // 配置中心\n" + " compile ('org.springframework.cloud:spring-cloud-starter-zookeeper-config')\n" + " // 服务发现\n" + " compile ('org.springframework.cloud:spring-cloud-starter-zookeeper-discovery')\n" + " compile ('com.netflix.hystrix:hystrix-javanica')\n" + "\n" + " // Spring Boot\n" + " compile('org.springframework.boot:spring-boot-starter-web')\n" + " compile('org.springframework.boot:spring-boot-starter-aop')\n" + " compile('org.springframework.boot:spring-boot-starter-data-redis')\n" + "\n" + " // Spring\n" + " compile('org.springframework:spring-orm')\n" + " compile('org.springframework:spring-jdbc')\n" + " compile('org.springframework:spring-aop')\n" + "\n" + " // mongodb\n" + " compile('org.springframework.data:spring-data-mongodb:1.10.23.RELEASE')\n" + "\n" + " // Mysql\n" + " 
runtime('mysql:mysql-connector-java')\n" + " compile('com.zaxxer:HikariCP')\n" + " compile('org.mybatis.spring.boot:mybatis-spring-boot-starter')\n" + " compile('com.github.pagehelper:pagehelper-spring-boot-starter')\n" + "\n" + " //redisson\n" + " compile('io.projectreactor:reactor-core:3.2.8.RELEASE')\n" + "\n" + " // Jackson\n" + " compile('com.fasterxml.jackson.core:jackson-core')\n" + " compile('com.fasterxml.jackson.core:jackson-annotations')\n" + " compile('com.fasterxml.jackson.core:jackson-databind')\n" + " compile('org.codehaus.jackson:jackson-core-asl')\n" + "\n" + " compile('joda-time:joda-time')\n" + " compile('commons-io:commons-io:2.5')\n" + " compile('org.apache.commons:commons-lang3:3.5')\n" + " compile('org.apache.commons:commons-collections4:4.1')\n" + " compile('cglib:cglib:3.2.5')\n" + " compile('net.java.dev.jna:jna:5.8.0')\n" + " compile('org.apache.calcite:calcite-core:1.26.0')\n" + "\n" + " // Test\n" + " testCompile('org.mockito:mockito-core:2.13.0')\n" + " testCompile('org.springframework:spring-test')\n" + " testCompile('org.springframework.boot:spring-boot-starter-test')\n" + "\n" + " // string-similarity\n" + " compile('info.debatty:java-string-similarity:0.24')\n" + "\n" + " compile('com.jayway.jsonpath:json-path')\n" + "\n" + " compile('com.qt.qt-common:cron-lib:1.0.0')\n" + "\n" + "}"; @Test public void tsetHighlight() { String regex = "org\\.apache"; List<MatchedFileContent> matched = HighLightUtil.highlight(content, regex); Assert.assertTrue(matched.size() > 0); } @Test public void testHighlightBase64() { String content = "MG1laW5hMiAxbWVpbmEyCjBtZWluYTIgMW1laW5hMgo="; String regex = "meina2"; List<MatchedFileContent> matchedFileContents = HighLightUtil.highlightBase64(content, regex); Assert.assertEquals(1, matchedFileContents.size()); Assert.assertEquals("[MatchedFileContent(startLine=1, partContent=0<qthighlight--meina2--qthighlight> 1<qthighlight--meina2--qthighlight>\n" + "0<qthighlight--meina2--qthighlight> 
1<qthighlight--meina2--qthighlight>\n" + ")]", matchedFileContents.toString()); } @Test public void testHighLight2() { String content = "customdir2 1\n" + "customdir2 2\n" + "customdir2 3\n" + "customdir2 4\n" + "customdir2 5\n" + "customdir2 6\n" + "customdir2 7 customdir2 7 customdir2 7 customdir2 7\n" + "customdir2 8 customdir2 8 customdir2 8 customdir2 8"; String regex = "customdir2"; List<MatchedFileContent> matchedFileContents = HighLightUtil.highlight(content, regex); Assert.assertEquals(2, matchedFileContents.size()); } @Test public void testHighLight3() { String content = "customdir2 1\n" + "customdir2 2\n" + "customdir3 3\n" + "customdir5 4\n" + "customdir6 5\n" + "customdir9 6\n" + "customdir10 7 customdir8 7 customdird 7 customdiro 7\n" + "customdir2 8 customdir2 8 customdir2 8 customdir2 8"; String regex = "customdir2"; List<MatchedFileContent> matchedFileContents = HighLightUtil.highlight(content, regex); Assert.assertEquals(2, matchedFileContents.size()); } }
本文讲解了如何运用“构造与使用分离”的思想,来重构和改进高亮展示命中文本内容的算法实现。 构造与使用分离,即在构造阶段先抽取出所需的必要信息,在使用阶段再运用这些信息去实现所需的功能;而不是边构建边使用,将构建与使用耦合在一起,否则后续一旦有需求变更,改动就会比较麻烦。