I am able to get the proper OCR output using newly trained tessdata (version 3.02) from the command prompt, but I want the same output from C# code through a DLL reference. I have tried referencing tessnet2_32.dll, but it throws an exception. How can I use the Tesseract 3.02 trained tessdata through a DLL reference in C# code?
可以将文章内容翻译成中文,广告屏蔽插件可能会导致该功能失效(如失效,请关闭广告屏蔽插件后再试):
问题:
回答1:
tessnet2 was built for Tesseract 2.04. You'll need a .NET wrapper that is compatible with version 3.02.
回答2:
To use Tesseract 3.02 trained data, we can create a separate wrapper class that calls the tesseract.exe command-line tool, like the one below.
using System;
using System.IO;
using System.Diagnostics;
using System.Drawing;
namespace tesseractThree
{
    /// <summary>
    /// Thin wrapper that performs OCR by launching the Tesseract command-line
    /// executable and reading back the text file it produces.
    /// </summary>
    /// <remarks>
    /// The intermediate image and the result are written to fixed file names
    /// ("out.tif" / "out.txt") in the current user's ApplicationData folder, so
    /// two OCR runs executing at the same time for the same user will overwrite
    /// each other's files.
    /// </remarks>
    public class TesseractOCR
    {
        private string commandpath;
        private string outpath;
        private string tmppath;

        /// <summary>
        /// Creates an instance without an executable path. Such an instance cannot
        /// run OCR; use <see cref="TesseractOCR(string)"/> instead.
        /// </summary>
        public TesseractOCR()
        {
        }

        /// <summary>
        /// Creates a wrapper around the given Tesseract executable.
        /// </summary>
        /// <param name="commandpath">Path of the tesseract executable to launch.</param>
        public TesseractOCR(string commandpath)
        {
            this.commandpath = commandpath;
            string appData = Environment.GetFolderPath(Environment.SpecialFolder.ApplicationData);
            tmppath = Path.Combine(appData, "out.tif");
            outpath = Path.Combine(appData, "out.txt");
        }

        /// <summary>
        /// Runs Tesseract on an image file and returns the recognised text.
        /// </summary>
        /// <param name="filename">Path of the image file to recognise.</param>
        /// <param name="lang">Language / traineddata name passed to <c>-l</c>; the switch is omitted when null or empty.</param>
        /// <param name="noLine">When true, <c>-psm 6</c> is appended to the command line.</param>
        /// <returns>The full content of the text file written by Tesseract.</returns>
        /// <exception cref="InvalidOperationException">The instance was created without an executable path.</exception>
        /// <exception cref="ArgumentException"><paramref name="filename"/> is null or empty.</exception>
        public string analyze(string filename, string lang, bool noLine)
        {
            if (string.IsNullOrEmpty(commandpath) || outpath == null)
            {
                throw new InvalidOperationException(
                    "No tesseract executable path was supplied; construct TesseractOCR with a command path.");
            }
            if (string.IsNullOrEmpty(filename))
            {
                throw new ArgumentException("An image file name is required.", "filename");
            }

            // Tesseract appends ".txt" to the output base name itself, so strip only
            // the extension (string.Replace would also hit ".txt" inside a folder name).
            string outputBase = Path.ChangeExtension(outpath, null);

            // Quote both paths: ApplicationData and user-supplied paths often contain spaces.
            string args = Quote(filename) + " " + Quote(outputBase);
            if (!string.IsNullOrEmpty(lang))
            {
                args += " -l " + lang;
            }
            if (noLine)
            {
                args += " -psm 6";
            }

            ProcessStartInfo startinfo = new ProcessStartInfo(commandpath, args);
            startinfo.CreateNoWindow = true;
            startinfo.UseShellExecute = false;

            try
            {
                // Process wraps an OS handle; release it as soon as the run finishes.
                using (Process process = Process.Start(startinfo))
                {
                    process.WaitForExit();
                }
                return File.ReadAllText(outpath);
            }
            finally
            {
                // Never leave a stale result behind that a later, failed run could pick up.
                File.Delete(outpath);
            }
        }

        /// <summary>
        /// Saves the bitmap as a temporary TIFF file, runs <see cref="analyze"/> on it
        /// and returns the recognised text. The temporary file is always removed.
        /// </summary>
        /// <param name="bmp">Image to recognise.</param>
        /// <param name="lang">Language / traineddata name passed to <c>-l</c>.</param>
        /// <param name="noLine">When true, <c>-psm 6</c> is appended to the command line.</param>
        /// <returns>The text recognised by Tesseract.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="bmp"/> is null.</exception>
        /// <exception cref="InvalidOperationException">The instance was created without an executable path.</exception>
        public string OCRFromBitmap(Bitmap bmp, string lang, bool noLine)
        {
            if (bmp == null)
            {
                throw new ArgumentNullException("bmp");
            }
            if (tmppath == null)
            {
                throw new InvalidOperationException(
                    "No tesseract executable path was supplied; construct TesseractOCR with a command path.");
            }

            try
            {
                bmp.Save(tmppath, System.Drawing.Imaging.ImageFormat.Tiff);
                return analyze(tmppath, lang, noLine);
            }
            finally
            {
                File.Delete(tmppath);
            }
        }

        // Wraps a path in double quotes for the command line unless the caller already quoted it.
        private static string Quote(string path)
        {
            if (path.Length >= 2 && path[0] == '"' && path[path.Length - 1] == '"')
            {
                return path;
            }
            return "\"" + path + "\"";
        }
    }
}
// Usage of this class: OCR a TIFF image with the "enc" language data and show the text.
string lang = "enc";
TesseractOCR ocr = new TesseractOCR(@"C:\Program Files\Tesseract-OCR\tesseract.exe");
string result;
// Bitmap holds a GDI+ handle and keeps the source file locked until it is disposed.
using (Bitmap b = new Bitmap(@"D:\Image\enc.test_font.exp0.tif"))
{
    result = ocr.OCRFromBitmap(b, lang, true);
}
Label1.Text = result;
Or refer to the link below for more details:
https://gist.github.com/yatt/915443
回答3:
Using tesseractengine3.dll, we can use Tesseract v3.02 trained data as shown below.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using tesseract;
using System.Drawing;
using System.IO;
/// <summary>
/// OCR engine selection, cast to <c>int</c> when handed to the engine's Init call.
/// </summary>
public enum TesseractEngineMode
{
    /// <summary>Use the Tesseract engine alone (the fastest choice).</summary>
    TESSERACT_ONLY = 0,

    /// <summary>Use the Cube engine alone (slower, but more accurate).</summary>
    CUBE_ONLY = 1,

    /// <summary>Run both engines and merge their results (most accurate).</summary>
    TESSERACT_CUBE_COMBINED = 2,

    /// <summary>
    /// Pass this to init_*() to let the engine infer one of the modes above from
    /// the language-specific config or command-line configs; if none of them
    /// specifies a mode, OEM_TESSERACT_ONLY is used.
    /// </summary>
    DEFAULT = 3
}
/// <summary>
/// Page segmentation modes; the numeric value is what gets written into the
/// "tessedit_pageseg_mode" engine variable.
/// </summary>
/// <remarks>
/// NOTE(review): this numbering starts at PSM_AUTO = 0. The native PageSegMode
/// enum of Tesseract 3.02 is believed to number its modes differently (for
/// example single line = 7 there) - confirm against the engine version wrapped
/// by the DLL before relying on these values.
/// </remarks>
public enum TesseractPageSegMode
{
    /// <summary>Page segmentation is fully automatic.</summary>
    PSM_AUTO = 0,

    /// <summary>The page is one column of text with variable sizes.</summary>
    PSM_SINGLE_COLUMN = 1,

    /// <summary>The page is one uniform block of text (default).</summary>
    PSM_SINGLE_BLOCK = 2,

    /// <summary>The image is one line of text.</summary>
    PSM_SINGLE_LINE = 3,

    /// <summary>The image is one word.</summary>
    PSM_SINGLE_WORD = 4,

    /// <summary>The image is one character.</summary>
    PSM_SINGLE_CHAR = 5
}
/// <summary>
/// Web Forms page that runs OCR on a fixed TIFF image through
/// <c>TesseractProcessor</c> and writes the recognised text to the response.
/// </summary>
public partial class importDLL : System.Web.UI.Page
{
    private TesseractProcessor m_tesseract = null;

    // Folder containing the traineddata files, and the language to load from it.
    private const string m_path = @"D:\tessdata-3.02\";
    private const string m_lang = "eng";

    /// <summary>
    /// Initialises the OCR engine, recognises the image and writes the text to the response.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event arguments (unused).</param>
    /// <exception cref="InvalidOperationException">The engine's Init call reported failure.</exception>
    protected void Page_Load(object sender, EventArgs e)
    {
        // Image wraps a GDI+ handle and keeps the file locked; dispose it once OCR is done.
        using (var image = System.Drawing.Image.FromFile(@"D:\Image\Capture1T.tif"))
        {
            m_tesseract = new TesseractProcessor();
            bool succeed = m_tesseract.Init(m_path, m_lang, (int)TesseractEngineMode.DEFAULT);
            if (!succeed)
            {
                // Stop here instead of running OCR on an engine that reported a failed Init.
                throw new InvalidOperationException(
                    "Failed to initialise Tesseract for language '" + m_lang + "' using data path '" + m_path + "'.");
            }

            m_tesseract.SetVariable("tessedit_pageseg_mode", ((int)TesseractPageSegMode.PSM_SINGLE_LINE).ToString());
            m_tesseract.Clear();
            m_tesseract.ClearAdaptiveClassifier();

            string outValue = m_tesseract.Apply(image);
            Response.Write(outValue);
        }
    }
}