Skip to content

Commit a9b9534

Browse files
authored
Merge pull request microsoft#21416 from katainaka0503/auto-detect-encoding
Auto guess encoding
2 parents 06492ab + 7915b89 commit a9b9534

14 files changed

Lines changed: 160 additions & 59 deletions

File tree

npm-shrinkwrap.json

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
"http-proxy-agent": "0.2.7",
3030
"https-proxy-agent": "0.3.6",
3131
"iconv-lite": "0.4.15",
32+
"jschardet": "^1.4.2",
3233
"minimist": "1.2.0",
3334
"native-keymap": "1.2.1",
3435
"node-pty": "0.6.2",

src/typings/jschardet.d.ts

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
declare module 'jschardet' {
2+
export interface IDetectedMap {
3+
encoding: string,
4+
confidence: number
5+
}
6+
export function detect(buffer: NodeBuffer): IDetectedMap;
7+
8+
export const Constants: {
9+
MINIMUM_THRESHOLD: number,
10+
}
11+
}

src/vs/base/node/encoding.ts

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import stream = require('vs/base/node/stream');
99
import iconv = require('iconv-lite');
1010
import { TPromise } from 'vs/base/common/winjs.base';
11+
import jschardet = require('jschardet');
1112

1213
export const UTF8 = 'utf8';
1314
export const UTF8_with_bom = 'utf8bom';
@@ -91,7 +92,36 @@ export function detectEncodingByBOMFromBuffer(buffer: NodeBuffer, bytesRead: num
9192
* If no BOM is detected, the returned promise resolves with null.
9293
*/
9394
export function detectEncodingByBOM(file: string): TPromise<string> {
94-
return stream.readExactlyByFile(file, 3).then(({buffer, bytesRead}) => detectEncodingByBOMFromBuffer(buffer, bytesRead));
95+
return stream.readExactlyByFile(file, 3).then(({ buffer, bytesRead }) => detectEncodingByBOMFromBuffer(buffer, bytesRead));
96+
}
97+
98+
const MINIMUM_THRESHOLD = 0.2; // TODO@Ben Decide how much this should be.
99+
jschardet.Constants.MINIMUM_THRESHOLD = MINIMUM_THRESHOLD;
100+
101+
const IGNORE_ENCODINGS = ['ascii', 'utf-8', 'utf-16', 'utf-32'];
102+
103+
/**
104+
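* Guesses the encoding from the content of the given buffer. Returns null if nothing could be guessed or the guessed encoding is one of the ignored ones.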
105+
*/
106+
export function guessEncodingByBuffer(buffer: NodeBuffer): string {
107+
const guessed = jschardet.detect(buffer);
108+
if (!guessed || !guessed.encoding) {
109+
return null;
110+
}
111+
112+
const enc = guessed.encoding.toLowerCase();
113+
114+
// Ignore encodings that cannot be guessed correctly
115+
// (http://chardet.readthedocs.io/en/latest/supported-encodings.html)
116+
if (0 <= IGNORE_ENCODINGS.indexOf(enc)) {
117+
return null;
118+
}
119+
120+
return lowerCaseWithoutNonAlphaNumeric(guessed.encoding);
121+
}
122+
123+
function lowerCaseWithoutNonAlphaNumeric(encodingName: string): string {
124+
return encodingName.replace(/[^a-zA-Z0-9]/g, '').toLowerCase();
95125
}
96126

97127
/**

src/vs/base/node/mime.ts

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -58,15 +58,23 @@ export interface IMimeAndEncoding {
5858
mimes: string[];
5959
}
6060

61-
function doDetectMimesFromStream(instream: streams.Readable): TPromise<IMimeAndEncoding> {
62-
return stream.readExactlyByStream(instream, BUFFER_READ_MAX_LEN).then(detectMimeAndEncodingFromBuffer);
61+
export interface DetectMimesOption {
62+
autoGuessEncoding?: boolean;
6363
}
6464

65-
function doDetectMimesFromFile(absolutePath: string): TPromise<IMimeAndEncoding> {
66-
return stream.readExactlyByFile(absolutePath, BUFFER_READ_MAX_LEN).then(detectMimeAndEncodingFromBuffer);
65+
function doDetectMimesFromStream(instream: streams.Readable, option?: DetectMimesOption): TPromise<IMimeAndEncoding> {
66+
return stream.readExactlyByStream(instream, BUFFER_READ_MAX_LEN).then((readResult: stream.ReadResult) => {
67+
return detectMimeAndEncodingFromBuffer(readResult, option && option.autoGuessEncoding);
68+
});
6769
}
6870

69-
export function detectMimeAndEncodingFromBuffer({buffer, bytesRead}: stream.ReadResult): IMimeAndEncoding {
71+
function doDetectMimesFromFile(absolutePath: string, option?: DetectMimesOption): TPromise<IMimeAndEncoding> {
72+
return stream.readExactlyByFile(absolutePath, BUFFER_READ_MAX_LEN).then((readResult: stream.ReadResult) => {
73+
return detectMimeAndEncodingFromBuffer(readResult, option && option.autoGuessEncoding);
74+
});
75+
}
76+
77+
export function detectMimeAndEncodingFromBuffer({ buffer, bytesRead }: stream.ReadResult, autoGuessEncoding?: boolean): IMimeAndEncoding {
7078
let enc = encoding.detectEncodingByBOMFromBuffer(buffer, bytesRead);
7179

7280
// Detect 0 bytes to see if file is binary (ignore for UTF 16 though)
@@ -79,6 +87,9 @@ export function detectMimeAndEncodingFromBuffer({buffer, bytesRead}: stream.Read
7987
}
8088
}
8189
}
90+
if (autoGuessEncoding && isText && !enc) {
91+
enc = encoding.guessEncodingByBuffer(buffer);
92+
}
8293

8394
return {
8495
mimes: isText ? [mime.MIME_TEXT] : [mime.MIME_BINARY],
@@ -116,8 +127,8 @@ function filterAndSortMimes(detectedMimes: string[], guessedMimes: string[]): st
116127
* @param instream the readable stream to detect the mime types from.
117128
* @param nameHint an additional hint that can be used to detect a mime from a file extension.
118129
*/
119-
export function detectMimesFromStream(instream: streams.Readable, nameHint: string): TPromise<IMimeAndEncoding> {
120-
return doDetectMimesFromStream(instream).then(encoding =>
130+
export function detectMimesFromStream(instream: streams.Readable, nameHint: string, option?: DetectMimesOption): TPromise<IMimeAndEncoding> {
131+
return doDetectMimesFromStream(instream, option).then(encoding =>
121132
handleMimeResult(nameHint, encoding)
122133
);
123134
}
@@ -126,8 +137,8 @@ export function detectMimesFromStream(instream: streams.Readable, nameHint: stri
126137
* Opens the given file to detect its mime type. Returns an array of mime types sorted from most specific to unspecific.
127138
* @param absolutePath the absolute path of the file.
128139
*/
129-
export function detectMimesFromFile(absolutePath: string): TPromise<IMimeAndEncoding> {
130-
return doDetectMimesFromFile(absolutePath).then(encoding =>
140+
export function detectMimesFromFile(absolutePath: string, option?: DetectMimesOption): TPromise<IMimeAndEncoding> {
141+
return doDetectMimesFromFile(absolutePath, option).then(encoding =>
131142
handleMimeResult(absolutePath, encoding)
132143
);
133144
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
VSCODEは最高のエディタだ。

src/vs/base/test/node/mime/mime.test.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,4 +60,12 @@ suite('Mime', () => {
6060
done();
6161
}, done);
6262
});
63+
64+
test('autoGuessEncoding (ShiftJIS)', function (done: () => void) {
65+
const file = require.toUrl('./fixtures/some.shiftjis.txt');
66+
mime.detectMimesFromFile(file, { autoGuessEncoding: true }).then(mimes => {
67+
assert.equal(mimes.encoding, 'shiftjis');
68+
done();
69+
}, done);
70+
});
6371
});

src/vs/platform/files/common/files.ts

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -495,6 +495,11 @@ export interface IResolveContentOptions {
495495
* the contents of the file.
496496
*/
497497
encoding?: string;
498+
499+
/**
500+
* The optional autoGuessEncoding parameter allows guessing the encoding from the content of the file.
501+
*/
502+
autoGuessEncoding?: boolean;
498503
}
499504

500505
export interface IUpdateContentOptions {
@@ -575,6 +580,7 @@ export interface IFilesConfiguration {
575580
exclude: glob.IExpression;
576581
watcherExclude: { [filepattern: string]: boolean };
577582
encoding: string;
583+
autoGuessEncoding: boolean;
578584
defaultLanguage: string;
579585
trimTrailingWhitespace: boolean;
580586
autoSave: string;
@@ -798,17 +804,17 @@ export const SUPPORTED_ENCODINGS: { [encoding: string]: { labelLong: string; lab
798804
labelShort: 'ISO 8859-11',
799805
order: 42
800806
},
801-
'koi8-ru': {
807+
koi8ru: {
802808
labelLong: 'Cyrillic (KOI8-RU)',
803809
labelShort: 'KOI8-RU',
804810
order: 43
805811
},
806-
'koi8-t': {
812+
koi8t: {
807813
labelLong: 'Tajik (KOI8-T)',
808814
labelShort: 'KOI8-T',
809815
order: 44
810816
},
811-
GB2312: {
817+
gb2312: {
812818
labelLong: 'Simplified Chinese (GB 2312)',
813819
labelShort: 'GB 2312',
814820
order: 45

src/vs/platform/telemetry/common/telemetryUtils.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,7 @@ const configurationValueWhitelist = [
240240
'editor.acceptSuggestionOnCommitCharacter',
241241
'workbench.editor.showTabs',
242242
'files.encoding',
243+
'files.autoGuessEncoding',
243244
'editor.quickSuggestionsDelay',
244245
'editor.snippetSuggestions',
245246
'editor.selectionHighlight',

src/vs/workbench/browser/parts/editor/editorStatus.ts

Lines changed: 62 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ import { IEditor as IBaseEditor, IEditorInput } from 'vs/platform/editor/common/
3434
import { IWorkbenchEditorService } from 'vs/workbench/services/editor/common/editorService';
3535
import { IQuickOpenService, IPickOpenEntry, IFilePickOpenEntry } from 'vs/platform/quickOpen/common/quickOpen';
3636
import { IWorkspaceConfigurationService } from 'vs/workbench/services/configuration/common/configuration';
37-
import { IFilesConfiguration, SUPPORTED_ENCODINGS } from 'vs/platform/files/common/files';
37+
import { IFilesConfiguration, SUPPORTED_ENCODINGS, IFileService } from 'vs/platform/files/common/files';
3838
import { IInstantiationService } from 'vs/platform/instantiation/common/instantiation';
3939
import { IModeService } from 'vs/editor/common/services/modeService';
4040
import { IModelService } from 'vs/editor/common/services/modelService';
@@ -1031,7 +1031,8 @@ export class ChangeEncodingAction extends Action {
10311031
actionLabel: string,
10321032
@IWorkbenchEditorService private editorService: IWorkbenchEditorService,
10331033
@IQuickOpenService private quickOpenService: IQuickOpenService,
1034-
@IWorkspaceConfigurationService private configurationService: IWorkspaceConfigurationService
1034+
@IWorkspaceConfigurationService private configurationService: IWorkspaceConfigurationService,
1035+
@IFileService private fileService: IFileService
10351036
) {
10361037
super(actionId, actionLabel);
10371038
}
@@ -1072,51 +1073,69 @@ export class ChangeEncodingAction extends Action {
10721073
return undefined;
10731074
}
10741075

1075-
return TPromise.timeout(50 /* quick open is sensitive to being opened so soon after another */).then(() => {
1076-
const configuration = this.configurationService.getConfiguration<IFilesConfiguration>();
1077-
1078-
const isReopenWithEncoding = (action === reopenWithEncodingPick);
1079-
const configuredEncoding = configuration && configuration.files && configuration.files.encoding;
1080-
let directMatchIndex: number;
1081-
let aliasMatchIndex: number;
1082-
1083-
// All encodings are valid picks
1084-
const picks: IPickOpenEntry[] = Object.keys(SUPPORTED_ENCODINGS)
1085-
.sort((k1, k2) => {
1086-
if (k1 === configuredEncoding) {
1087-
return -1;
1088-
} else if (k2 === configuredEncoding) {
1089-
return 1;
1090-
}
1091-
1092-
return SUPPORTED_ENCODINGS[k1].order - SUPPORTED_ENCODINGS[k2].order;
1093-
})
1094-
.filter(k => {
1095-
return !isReopenWithEncoding || !SUPPORTED_ENCODINGS[k].encodeOnly; // hide those that can only be used for encoding if we are about to decode
1096-
})
1097-
.map((key, index) => {
1098-
if (key === encodingSupport.getEncoding()) {
1099-
directMatchIndex = index;
1100-
} else if (SUPPORTED_ENCODINGS[key].alias === encodingSupport.getEncoding()) {
1101-
aliasMatchIndex = index;
1102-
}
1076+
const guessEncoding = () => {
1077+
const resource = toResource(activeEditor.input);
1078+
return this.fileService.resolveContent(resource, { autoGuessEncoding: true, acceptTextOnly: true })
1079+
.then(content => content.encoding, err => null);
1080+
};
11031081

1104-
return { id: key, label: SUPPORTED_ENCODINGS[key].labelLong };
1105-
});
1082+
return TPromise.timeout(50 /* quick open is sensitive to being opened so soon after another */)
1083+
.then(guessEncoding)
1084+
.then(guessedEncoding => {
1085+
const configuration = this.configurationService.getConfiguration<IFilesConfiguration>();
1086+
1087+
const isReopenWithEncoding = (action === reopenWithEncodingPick);
1088+
const configuredEncoding = configuration && configuration.files && configuration.files.encoding;
1089+
let directMatchIndex: number;
1090+
let aliasMatchIndex: number;
1091+
1092+
// All encodings are valid picks
1093+
const picks: IPickOpenEntry[] = Object.keys(SUPPORTED_ENCODINGS)
1094+
.sort((k1, k2) => {
1095+
if (k1 === configuredEncoding) {
1096+
return -1;
1097+
} else if (k2 === configuredEncoding) {
1098+
return 1;
1099+
}
1100+
1101+
return SUPPORTED_ENCODINGS[k1].order - SUPPORTED_ENCODINGS[k2].order;
1102+
})
1103+
.filter(k => {
1104+
if (k === guessedEncoding && guessedEncoding !== configuredEncoding) {
1105+
return false; // hide the guessed encoding from the regular list when it differs from the configured encoding
1106+
}
1107+
1108+
return !isReopenWithEncoding || !SUPPORTED_ENCODINGS[k].encodeOnly; // hide those that can only be used for encoding if we are about to decode
1109+
})
1110+
.map((key, index) => {
1111+
if (key === encodingSupport.getEncoding()) {
1112+
directMatchIndex = index;
1113+
} else if (SUPPORTED_ENCODINGS[key].alias === encodingSupport.getEncoding()) {
1114+
aliasMatchIndex = index;
1115+
}
1116+
1117+
return { id: key, label: SUPPORTED_ENCODINGS[key].labelLong };
1118+
});
1119+
1120+
// If we have a guessed encoding, show it first unless it matches the configured encoding
1121+
if (guessedEncoding && configuredEncoding !== guessedEncoding && SUPPORTED_ENCODINGS[guessedEncoding]) {
1122+
picks[0].separator = { border: true };
1123+
picks.unshift({ id: guessedEncoding, label: SUPPORTED_ENCODINGS[guessedEncoding].labelLong, description: nls.localize('guessedEncoding', "Guessed from content") });
1124+
}
11061125

1107-
return this.quickOpenService.pick(picks, {
1108-
placeHolder: isReopenWithEncoding ? nls.localize('pickEncodingForReopen', "Select File Encoding to Reopen File") : nls.localize('pickEncodingForSave', "Select File Encoding to Save with"),
1109-
autoFocus: { autoFocusIndex: typeof directMatchIndex === 'number' ? directMatchIndex : typeof aliasMatchIndex === 'number' ? aliasMatchIndex : void 0 }
1110-
}).then(encoding => {
1111-
if (encoding) {
1112-
activeEditor = this.editorService.getActiveEditor();
1113-
encodingSupport = toEditorWithEncodingSupport(activeEditor.input);
1114-
if (encodingSupport && encodingSupport.getEncoding() !== encoding.id) {
1115-
encodingSupport.setEncoding(encoding.id, isReopenWithEncoding ? EncodingMode.Decode : EncodingMode.Encode); // Set new encoding
1126+
return this.quickOpenService.pick(picks, {
1127+
placeHolder: isReopenWithEncoding ? nls.localize('pickEncodingForReopen', "Select File Encoding to Reopen File") : nls.localize('pickEncodingForSave', "Select File Encoding to Save with"),
1128+
autoFocus: { autoFocusIndex: typeof directMatchIndex === 'number' ? directMatchIndex : typeof aliasMatchIndex === 'number' ? aliasMatchIndex : void 0 }
1129+
}).then(encoding => {
1130+
if (encoding) {
1131+
activeEditor = this.editorService.getActiveEditor();
1132+
encodingSupport = toEditorWithEncodingSupport(activeEditor.input);
1133+
if (encodingSupport && encodingSupport.getEncoding() !== encoding.id) {
1134+
encodingSupport.setEncoding(encoding.id, isReopenWithEncoding ? EncodingMode.Decode : EncodingMode.Encode); // Set new encoding
1135+
}
11161136
}
1117-
}
1137+
});
11181138
});
1119-
});
11201139
});
11211140
}
11221141
}

0 commit comments

Comments
 (0)