系统: windows 11
jdk: 1.8
框架: spring boot 2.1.4 + maven
工具: idea
官方网站:https://github.com/tesseract-ocr/tesseract
官方文档:https://github.com/tesseract-ocr/tessdoc
语言包地址:https://github.com/tesseract-ocr/tessdata
下载地址:https://digi.bib.uni-mannheim.de/tesseract/
在我的资源里面也提交了5.0 windows 的安装包.(我想免费的,官方没有这个选择)
64位: https://download.csdn.net/download/qq_35885175/68236012
32位: https://download.csdn.net/download/qq_35885175/68235937
双击 选择语言点击OK
直接next(下一步)
日常同意协议
选择给谁安装(看自己需要,我的是个人电脑)
接下来是选择语言包
最后那两个勾选框是在线下载语言包的选项,不建议勾选(下载很慢,除非使用梯子;下文有手动扩展语言包的教程)
选择安装目录
next,等待安装完成
(有一个选择是否添加注册表的,我忘记截图了,默认next 干就完了)
安装完成(结束)
这个步骤对于大家来说就很简单了
万能的cmd来了
上面那个链接可以跳转github里面下载(传送门)
csdn我上传的:https://download.csdn.net/download/qq_35885175/68245426
这个东西就是简体中文的啦
go:
把下载好的 chi_sim.traineddata 文件放入安装目录下的 tessdata 文件夹中,就完成了
测试语言包是否扩展成功
小黑框(cmd) 输入: tesseract --list-langs
搞定
<dependency> <groupId>net.sourceforge.tess4j</groupId> <artifactId>tess4j</artifactId> <version>5.0.0</version> </dependency>
下载不成功的话 多切换几个 version
jar包链接 : https://download.csdn.net/download/qq_35885175/68256292
或者下载我上传的这个 jar 包,按照目录结构直接放到本地 Maven 仓库中
package com.lk.integutils.utils;

import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.Iterator;
import java.util.Locale;

import javax.imageio.IIOImage;
import javax.imageio.ImageIO;
import javax.imageio.ImageReader;
import javax.imageio.ImageWriteParam;
import javax.imageio.ImageWriter;
import javax.imageio.metadata.IIOMetadata;
import javax.imageio.stream.ImageInputStream;
import javax.imageio.stream.ImageOutputStream;

import com.github.jaiimageio.plugins.tiff.TIFFImageWriteParam;

/**
 * Writes an uncompressed TIFF copy of an image next to the original, so that
 * later processing works on the copy and never touches the source file.
 */
public class ImageIOHelper {

    /** Text inserted before the file extension to build the temp file name. */
    private static final String TEMP_SUFFIX = "_text_recognize_temp";

    /** Locale handed to the TIFF write parameters; defaults to {@link Locale#CHINESE}. */
    private Locale locale = Locale.CHINESE;

    /**
     * Creates a helper that uses a caller-supplied locale.
     *
     * @param locale locale passed to the TIFF write parameters
     */
    public ImageIOHelper(Locale locale) {
        this.locale = locale;
    }

    /** Creates a helper that uses the default locale, {@link Locale#CHINESE}. */
    public ImageIOHelper() {
    }

    /**
     * Reads the first image of {@code imageFile} and writes it, uncompressed, as
     * TIFF data into a temp file in the same directory.
     *
     * @param imageFile   the source image; it is only read, never modified
     * @param imageFormat ImageIO format name of the source, e.g. {@code png} or {@code jpg}
     * @return the temp file holding the TIFF data
     * @throws IOException if no reader exists for {@code imageFormat}, no TIFF writer
     *                     is registered, or reading/writing fails
     */
    public File createImage(File imageFile, String imageFormat) throws IOException {
        // Look both codecs up first so nothing is created when one is missing.
        Iterator<ImageReader> readers = ImageIO.getImageReadersByFormatName(imageFormat);
        if (!readers.hasNext()) {
            throw new IOException("No ImageIO reader available for format: " + imageFormat);
        }
        Iterator<ImageWriter> writers = ImageIO.getImageWritersByFormatName("tiff");
        if (!writers.hasNext()) {
            throw new IOException("No ImageIO writer available for format: tiff");
        }
        ImageReader reader = readers.next();
        ImageWriter writer = writers.next();

        File tempFile = tempImageFile(imageFile);
        // The output stream does not truncate, so drop any stale file from an earlier run.
        Files.deleteIfExists(tempFile.toPath());

        try (ImageInputStream iis = ImageIO.createImageInputStream(imageFile)) {
            if (iis == null) {
                throw new IOException("Cannot open image for reading: " + imageFile);
            }
            reader.setInput(iis);
            IIOMetadata streamMetadata = reader.getStreamMetadata();

            // Use the configured locale and switch compression off.
            TIFFImageWriteParam tiffWriteParam = new TIFFImageWriteParam(locale);
            tiffWriteParam.setCompressionMode(ImageWriteParam.MODE_DISABLED);

            BufferedImage bi = reader.read(0);
            IIOImage image = new IIOImage(bi, null, reader.getImageMetadata(0));

            try (ImageOutputStream ios = ImageIO.createImageOutputStream(tempFile)) {
                if (ios == null) {
                    throw new IOException("Cannot open temp image for writing: " + tempFile);
                }
                writer.setOutput(ios);
                writer.write(streamMetadata, image, tiffWriteParam);
            }
        } finally {
            // Release codec resources on success and on failure alike.
            writer.dispose();
            reader.dispose();
        }

        // Hide only after the file exists; hiding a missing file has no effect.
        markHidden(tempFile);
        return tempFile;
    }

    /**
     * Builds the temp file for {@code imageFile}: same directory, same name, with
     * {@code _text_recognize_temp} inserted before the extension (or appended when
     * the name has no extension).
     *
     * @param imageFile the source image
     * @return the temp file location; the file itself is not created here
     */
    private File tempImageFile(File imageFile) {
        // Work on the file name only, so a dot in a directory name is never mistaken
        // for the extension separator.
        String name = imageFile.getName();
        int dot = name.lastIndexOf('.');
        String tempName = dot < 0
                ? name + TEMP_SUFFIX
                : name.substring(0, dot) + TEMP_SUFFIX + name.substring(dot);
        return new File(imageFile.getParentFile(), tempName);
    }

    /**
     * Best-effort: sets the DOS "hidden" attribute on {@code file} when running on
     * Windows. Failures are ignored because hiding is cosmetic only.
     *
     * @param file an existing file
     */
    private static void markHidden(File file) {
        String osName = System.getProperty("os.name", "").toLowerCase(Locale.ROOT);
        if (!osName.startsWith("windows")) {
            return;
        }
        try {
            Files.setAttribute(file.toPath(), "dos:hidden", Boolean.TRUE);
        } catch (IOException | RuntimeException ignored) {
            // Deliberately ignored: a visible temp file is still fully usable.
        }
    }
}
package com.lk.integutils.utils;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import org.jdesktop.swingx.util.OS;

/**
 * Runs the {@code tesseract} command-line program on an image and returns the
 * recognized text with all whitespace removed.
 */
public class OCRUtil {

    /** Language option flag: a lowercase letter "l", not the digit one. */
    private final String LANG_OPTION = "-l";

    private final String EOL = System.getProperty("line.separator");

    /**
     * Languages handed to tesseract as ONE argument joined with '+':
     * simplified Chinese, math/equation data, and English.
     */
    private static final String LANGUAGES = "chi_sim+equ+eng";

    /** Tesseract installation directory. */
    private String tessPath = "D:\\Utils\\Tesseract-OCR";

    /**
     * Creates a utility that uses a custom tesseract installation directory.
     *
     * @param tessPath      tesseract installation directory
     * @param transFileName currently unused; kept so existing callers keep compiling
     */
    public OCRUtil(String tessPath, String transFileName) {
        this.tessPath = tessPath;
    }

    /** Creates a utility with the default installation directory {@code D:\Utils\Tesseract-OCR}. */
    public OCRUtil() {
    }

    public String getTessPath() {
        return tessPath;
    }

    public void setTessPath(String tessPath) {
        this.tessPath = tessPath;
    }

    public String getLANG_OPTION() {
        return LANG_OPTION;
    }

    public String getEOL() {
        return EOL;
    }

    /**
     * Recognizes the text in an image using the default {@link ImageIOHelper}.
     *
     * @param imageFile   the image to recognize
     * @param imageFormat ImageIO format name of the image, e.g. {@code png}
     * @return the recognized text with all whitespace removed
     * @throws Exception if the image cannot be converted or tesseract fails
     */
    public String recognizeText(File imageFile, String imageFormat) throws Exception {
        File tempImage = new ImageIOHelper().createImage(imageFile, imageFormat);
        return ocrImages(tempImage, imageFile);
    }

    /**
     * Same as {@link #recognizeText(File, String)} but passes a custom locale to
     * the {@link ImageIOHelper} that prepares the temp image.
     *
     * @param imageFile   the image to recognize
     * @param imageFormat ImageIO format name of the image, e.g. {@code png}
     * @param locale      locale passed to {@link ImageIOHelper}
     * @return the recognized text with all whitespace removed
     * @throws Exception if the image cannot be converted or tesseract fails
     */
    public String recognizeText(File imageFile, String imageFormat, Locale locale) throws Exception {
        File tempImage = new ImageIOHelper(locale).createImage(imageFile, imageFormat);
        return ocrImages(tempImage, imageFile);
    }

    /**
     * Runs {@code tesseract <tempImage> test -l chi_sim+equ+eng} in the image's
     * directory, reads the produced {@code test.txt}, and cleans up both the temp
     * image and the result file whether or not the run succeeds.
     *
     * @param tempImage the temp image to feed to tesseract; deleted afterwards
     * @param imageFile the original image; only its parent directory is used
     * @return the recognized text with all whitespace removed
     * @throws IOException          if the process cannot start or the result cannot be read
     * @throws InterruptedException if interrupted while waiting for tesseract
     * @throws RuntimeException     if tesseract exits with a non-zero code
     */
    private String ocrImages(File tempImage, File imageFile) throws IOException, InterruptedException {
        File workDir = imageFile.getParentFile();
        // Tesseract appends ".txt" to the output base name itself.
        File outputFile = new File(workDir, "test");
        File resultFile = new File(outputFile.getAbsolutePath() + ".txt");

        List<String> cmd = new ArrayList<String>();
        // On Linux the binary is taken from PATH; elsewhere from the install directory.
        cmd.add(OS.isLinux() ? "tesseract" : new File(tessPath, "tesseract").getPath());
        cmd.add(tempImage.getName());
        cmd.add(outputFile.getName());
        cmd.add(LANG_OPTION);
        cmd.add(LANGUAGES);

        ProcessBuilder pb = new ProcessBuilder(cmd);
        pb.directory(workDir);
        // Merge stderr into stdout so one reader drains everything.
        pb.redirectErrorStream(true);

        long startTime = System.currentTimeMillis();
        System.out.println("开始时间:" + startTime);

        try {
            Process process = pb.start();
            String processOutput;
            int exitCode;
            try {
                // Drain before waiting: an unread pipe can fill up and block the child forever.
                processOutput = drain(process.getInputStream());
                exitCode = process.waitFor();
            } finally {
                // No-op after a normal exit; stops the child if we were interrupted or failed.
                process.destroy();
            }

            if (exitCode != 0) {
                String msg;
                switch (exitCode) {
                    case 1:
                        msg = "Errors accessing files.There may be spaces in your image's filename.";
                        break;
                    case 29:
                        msg = "Cannot recongnize the image or its selected region.";
                        break;
                    case 31:
                        msg = "Unsupported image format.";
                        break;
                    default:
                        msg = "Errors occurred.";
                }
                String detail = processOutput.trim();
                throw new RuntimeException(
                        detail.isEmpty() ? msg : msg + " (exit code " + exitCode + "): " + detail);
            }

            StringBuilder text = new StringBuilder();
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(new FileInputStream(resultFile), StandardCharsets.UTF_8))) {
                String line;
                while ((line = in.readLine()) != null) {
                    text.append(line).append(EOL);
                }
            }

            long endTime = System.currentTimeMillis();
            System.out.println("结束时间:" + endTime);
            System.out.println("耗时:" + (endTime - startTime) + "毫秒");

            return text.toString().replaceAll("\\s+", "");
        } finally {
            // Remove the working files on every path, including failures.
            tempImage.delete();
            resultFile.delete();
        }
    }

    /**
     * Reads a stream to its end and returns the content decoded as UTF-8.
     *
     * @param input the stream to consume; closed when done
     * @return everything read from the stream
     * @throws IOException if reading fails
     */
    private static String drain(InputStream input) throws IOException {
        StringBuilder out = new StringBuilder();
        try (Reader reader = new InputStreamReader(input, StandardCharsets.UTF_8)) {
            char[] buf = new char[4096];
            int n;
            while ((n = reader.read(buf)) != -1) {
                out.append(buf, 0, n);
            }
        }
        return out.toString();
    }
}
package com.lk.integutils.utils;

import java.io.File;
import java.io.IOException;

/**
 * Manual smoke test: runs {@link OCRUtil} on one hard-coded PNG and prints the
 * recognized text to standard output.
 */
public class TestOcr {

    /**
     * Entry point.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Location of the image to recognize.
        final File image = new File("d://1640144341(1).png");
        final OCRUtil ocr = new OCRUtil();
        try {
            final String recognized = ocr.recognizeText(image, "png");
            System.out.println(recognized);
        } catch (IOException ioFailure) {
            ioFailure.printStackTrace();
        } catch (Exception otherFailure) {
            otherFailure.printStackTrace();
        }
    }
}
效果不咋的,需要调教
mall4j: https://segmentfault.com/a/1190000039362377
SyKay: https://www.jianshu.com/p/f7cb0b3f337a