-
-
Notifications
You must be signed in to change notification settings - Fork 24
Expand file tree
/
Copy pathEncDetector.cs
More file actions
94 lines (77 loc) · 3.13 KB
/
EncDetector.cs
File metadata and controls
94 lines (77 loc) · 3.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
/*!
* Copyright (c) 2013 Denis Kuzmin <x-3F@outlook.com> github/3F
* Copyright (c) vsSolutionBuildEvent contributors https://github.com/3F/vsSolutionBuildEvent/graphs/contributors
* Licensed under the LGPLv3.
* See accompanying LICENSE file or visit https://github.com/3F/vsSolutionBuildEvent
*/
using System;
using System.IO;
using System.Text;
using net.r_eg.SobaScript.Z.Ext;
namespace net.r_eg.vsSBE
{
/// <summary>
/// Charset detection and re-encoding helper built on top of <c>Ude.CharsetDetector</c>.
/// </summary>
internal sealed class EncDetector: IEncDetector
{
    /// <summary>
    /// Detects encoding for specified stream.
    /// </summary>
    /// <remarks>
    /// The stream is handed to the detector as-is; this method neither rewinds nor disposes it.
    /// </remarks>
    /// <param name="stream">Input stream.</param>
    /// <param name="confidence">
    /// Confidence reported by the detector; 0 when the stream is null or no charset was detected.
    /// </param>
    /// <returns>
    /// null if the encoding can't be detected or the detected charset
    /// has no matching <see cref="Encoding"/> in the current runtime.
    /// </returns>
    public Encoding Detect(Stream stream, out float confidence)
    {
        confidence = 0;

        if(stream == null) {
            return null;
        }

        Ude.CharsetDetector cdet = new Ude.CharsetDetector();
        cdet.Feed(stream);
        cdet.DataEnd();

        if(cdet.Charset == null) {
            return null;
        }

        confidence = cdet.Confidence;
        Log.Debug($"Detected charset '{cdet.Charset}' confidence: '{cdet.Confidence}'");

        return GetEncodingOrNull(cdet.Charset);
    }

    /// <summary>
    /// Detects encoding for specified stream.
    /// </summary>
    /// <param name="stream">Input stream.</param>
    /// <returns>null if can't be detected.</returns>
    public Encoding Detect(Stream stream)
        => Detect(stream, out _);

    /// <summary>
    /// Try to fix the wrong encoded string.
    /// </summary>
    /// <remarks>
    /// The input is converted to bytes through <paramref name="container"/>, the charset of those bytes
    /// is detected, and the very same bytes are then decoded using the detected encoding.
    /// </remarks>
    /// <param name="input">Input data.</param>
    /// <param name="container">Known information about bytes.</param>
    /// <param name="confidence">To limit accepted confidence.</param>
    /// <returns>
    /// The input itself when it is null, empty or white-space only.
    /// null if the charset can't be detected, if detected confidence is less than input limit,
    /// or if the detected charset has no matching <see cref="Encoding"/> in the current runtime.
    /// Otherwise, re-encoded string.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="container"/> is null while <paramref name="input"/> contains data.
    /// </exception>
    public string FixEncoding(string input, Encoding container, float confidence = 0.92f)
    {
        if(string.IsNullOrWhiteSpace(input)) {
            return input;
        }

        if(container == null) {
            throw new ArgumentNullException(nameof(container));
        }

        byte[] bytes = container.GetBytes(input);

        var cdet = new Ude.CharsetDetector();
        cdet.Feed(bytes, 0, bytes.Length);
        cdet.DataEnd();

        if(cdet.Charset == null) {
            return null;
        }

        Log.Debug($"{nameof(FixEncoding)}: charset '{cdet.Charset}' confidence: '{cdet.Confidence}'");

        if(cdet.Confidence < confidence) {
            Log.Debug($"{nameof(FixEncoding)}: Confidence < {confidence}");
            return null;
        }

        Encoding to = GetEncodingOrNull(cdet.Charset);
        if(to == null) {
            return null;
        }

        Log.Debug($"ReEncodeString: '{container.EncodingName}' -> '{to.EncodingName}'");
        Log.Trace($"ReEncodeString: original - '{input}'");

        return to.GetString(bytes);
    }

    /// <summary>
    /// Resolves a charset name into an <see cref="Encoding"/> without throwing for unknown names.
    /// </summary>
    /// <param name="charset">Charset name as reported by the detector.</param>
    /// <returns>null if the current runtime does not provide an encoding for this name.</returns>
    private static Encoding GetEncodingOrNull(string charset)
    {
        try
        {
            return Encoding.GetEncoding(charset);
        }
        // The detector may report names that the runtime does not know;
        // treat this the same way as "can't be detected" instead of failing the caller.
        catch(Exception ex) when(ex is ArgumentException || ex is NotSupportedException)
        {
            Log.Debug($"Encoding for charset '{charset}' is not available: {ex.Message}");
            return null;
        }
    }
}
}