Use punycode to parse unicode characters correctly

This commit is contained in:
Niklas von Hertzen 2014-09-14 19:32:26 +03:00
parent b8d3688c29
commit 9d088fa431
11 changed files with 661 additions and 17 deletions

3
.gitmodules vendored
View File

@ -1,3 +1,6 @@
[submodule "src/fabric"]
path = src/fabric
url = https://github.com/kangax/fabric.js.git
[submodule "src/punycode"]
path = src/punycode
url = https://github.com/bestiejs/punycode.js.git

View File

@ -23,7 +23,7 @@ module.exports = function(grunt) {
concat: {
dist: {
src: [
'src/promise.js', 'src/fallback.js', 'src/*.js', 'src/renderers/*.js'
'src/promise.js', 'src/fallback.js', 'src/punycode/punycode.js', 'src/core.js', 'src/*.js', 'src/renderers/*.js'
],
dest: 'dist/<%= pkg.name %>.js',
options:{
@ -38,7 +38,7 @@ module.exports = function(grunt) {
dest: 'dist/<%= pkg.name %>.svg.js',
options:{
banner: meta.banner + '\n(function(window, document, exports, undefined){\n\n',
footer: '\n})(window, document, html2canvas);'
footer: '\n}).call({}, window, document, html2canvas);'
}
}
},

603
dist/html2canvas.js vendored
View File

@ -37,6 +37,535 @@ if (typeof(Object.create) !== "function" || typeof(document.createElement("canva
return;
}
/*! https://mths.be/punycode v1.3.1 by @mathias */
;(function(root) {
/** Detect free variables */
var freeExports = typeof exports == 'object' && exports &&
!exports.nodeType && exports;
var freeModule = typeof module == 'object' && module &&
!module.nodeType && module;
var freeGlobal = typeof global == 'object' && global;
if (
freeGlobal.global === freeGlobal ||
freeGlobal.window === freeGlobal ||
freeGlobal.self === freeGlobal
) {
root = freeGlobal;
}
/**
* The `punycode` object.
* @name punycode
* @type Object
*/
var punycode,
/** Highest positive signed 32-bit float value */
maxInt = 2147483647, // aka. 0x7FFFFFFF or 2^31-1
/** Bootstring parameters */
base = 36,
tMin = 1,
tMax = 26,
skew = 38,
damp = 700,
initialBias = 72,
initialN = 128, // 0x80
delimiter = '-', // '\x2D'
/** Regular expressions */
regexPunycode = /^xn--/,
regexNonASCII = /[^\x20-\x7E]/, // unprintable ASCII chars + non-ASCII chars
regexSeparators = /[\x2E\u3002\uFF0E\uFF61]/g, // RFC 3490 separators
/** Error messages */
errors = {
'overflow': 'Overflow: input needs wider integers to process',
'not-basic': 'Illegal input >= 0x80 (not a basic code point)',
'invalid-input': 'Invalid input'
},
/** Convenience shortcuts */
baseMinusTMin = base - tMin,
floor = Math.floor,
stringFromCharCode = String.fromCharCode,
/** Temporary variable */
key;
/*--------------------------------------------------------------------------*/
/**
* A generic error utility function.
* @private
* @param {String} type The error type.
* @returns {Error} Throws a `RangeError` with the applicable error message.
*/
function error(type) {
throw RangeError(errors[type]);
}
/**
* A generic `Array#map` utility function.
* @private
* @param {Array} array The array to iterate over.
* @param {Function} callback The function that gets called for every array
* item.
* @returns {Array} A new array of values returned by the callback function.
*/
function map(array, fn) {
var length = array.length;
var result = [];
while (length--) {
result[length] = fn(array[length]);
}
return result;
}
/**
* A simple `Array#map`-like wrapper to work with domain name strings or email
* addresses.
* @private
* @param {String} domain The domain name or email address.
* @param {Function} callback The function that gets called for every
* character.
* @returns {Array} A new string of characters returned by the callback
* function.
*/
function mapDomain(string, fn) {
var parts = string.split('@');
var result = '';
if (parts.length > 1) {
// In email addresses, only the domain name should be punycoded. Leave
// the local part (i.e. everything up to `@`) intact.
result = parts[0] + '@';
string = parts[1];
}
var labels = string.split(regexSeparators);
var encoded = map(labels, fn).join('.');
return result + encoded;
}
/**
* Creates an array containing the numeric code points of each Unicode
* character in the string. While JavaScript uses UCS-2 internally,
* this function will convert a pair of surrogate halves (each of which
* UCS-2 exposes as separate characters) into a single code point,
* matching UTF-16.
* @see `punycode.ucs2.encode`
* @see <https://mathiasbynens.be/notes/javascript-encoding>
* @memberOf punycode.ucs2
* @name decode
* @param {String} string The Unicode input string (UCS-2).
* @returns {Array} The new array of code points.
*/
function ucs2decode(string) {
var output = [],
counter = 0,
length = string.length,
value,
extra;
while (counter < length) {
value = string.charCodeAt(counter++);
if (value >= 0xD800 && value <= 0xDBFF && counter < length) {
// high surrogate, and there is a next character
extra = string.charCodeAt(counter++);
if ((extra & 0xFC00) == 0xDC00) { // low surrogate
output.push(((value & 0x3FF) << 10) + (extra & 0x3FF) + 0x10000);
} else {
// unmatched surrogate; only append this code unit, in case the next
// code unit is the high surrogate of a surrogate pair
output.push(value);
counter--;
}
} else {
output.push(value);
}
}
return output;
}
/**
* Creates a string based on an array of numeric code points.
* @see `punycode.ucs2.decode`
* @memberOf punycode.ucs2
* @name encode
* @param {Array} codePoints The array of numeric code points.
* @returns {String} The new Unicode string (UCS-2).
*/
function ucs2encode(array) {
return map(array, function(value) {
var output = '';
if (value > 0xFFFF) {
value -= 0x10000;
output += stringFromCharCode(value >>> 10 & 0x3FF | 0xD800);
value = 0xDC00 | value & 0x3FF;
}
output += stringFromCharCode(value);
return output;
}).join('');
}
/**
* Converts a basic code point into a digit/integer.
* @see `digitToBasic()`
* @private
* @param {Number} codePoint The basic numeric code point value.
* @returns {Number} The numeric value of a basic code point (for use in
* representing integers) in the range `0` to `base - 1`, or `base` if
* the code point does not represent a value.
*/
function basicToDigit(codePoint) {
if (codePoint - 48 < 10) {
return codePoint - 22;
}
if (codePoint - 65 < 26) {
return codePoint - 65;
}
if (codePoint - 97 < 26) {
return codePoint - 97;
}
return base;
}
/**
* Converts a digit/integer into a basic code point.
* @see `basicToDigit()`
* @private
* @param {Number} digit The numeric value of a basic code point.
* @returns {Number} The basic code point whose value (when used for
* representing integers) is `digit`, which needs to be in the range
* `0` to `base - 1`. If `flag` is non-zero, the uppercase form is
* used; else, the lowercase form is used. The behavior is undefined
* if `flag` is non-zero and `digit` has no uppercase form.
*/
function digitToBasic(digit, flag) {
// 0..25 map to ASCII a..z or A..Z
// 26..35 map to ASCII 0..9
return digit + 22 + 75 * (digit < 26) - ((flag != 0) << 5);
}
/**
* Bias adaptation function as per section 3.4 of RFC 3492.
* http://tools.ietf.org/html/rfc3492#section-3.4
* @private
*/
function adapt(delta, numPoints, firstTime) {
var k = 0;
delta = firstTime ? floor(delta / damp) : delta >> 1;
delta += floor(delta / numPoints);
for (/* no initialization */; delta > baseMinusTMin * tMax >> 1; k += base) {
delta = floor(delta / baseMinusTMin);
}
return floor(k + (baseMinusTMin + 1) * delta / (delta + skew));
}
/**
* Converts a Punycode string of ASCII-only symbols to a string of Unicode
* symbols.
* @memberOf punycode
* @param {String} input The Punycode string of ASCII-only symbols.
* @returns {String} The resulting string of Unicode symbols.
*/
function decode(input) {
// Don't use UCS-2
var output = [],
inputLength = input.length,
out,
i = 0,
n = initialN,
bias = initialBias,
basic,
j,
index,
oldi,
w,
k,
digit,
t,
/** Cached calculation results */
baseMinusT;
// Handle the basic code points: let `basic` be the number of input code
// points before the last delimiter, or `0` if there is none, then copy
// the first basic code points to the output.
basic = input.lastIndexOf(delimiter);
if (basic < 0) {
basic = 0;
}
for (j = 0; j < basic; ++j) {
// if it's not a basic code point
if (input.charCodeAt(j) >= 0x80) {
error('not-basic');
}
output.push(input.charCodeAt(j));
}
// Main decoding loop: start just after the last delimiter if any basic code
// points were copied; start at the beginning otherwise.
for (index = basic > 0 ? basic + 1 : 0; index < inputLength; /* no final expression */) {
// `index` is the index of the next character to be consumed.
// Decode a generalized variable-length integer into `delta`,
// which gets added to `i`. The overflow checking is easier
// if we increase `i` as we go, then subtract off its starting
// value at the end to obtain `delta`.
for (oldi = i, w = 1, k = base; /* no condition */; k += base) {
if (index >= inputLength) {
error('invalid-input');
}
digit = basicToDigit(input.charCodeAt(index++));
if (digit >= base || digit > floor((maxInt - i) / w)) {
error('overflow');
}
i += digit * w;
t = k <= bias ? tMin : (k >= bias + tMax ? tMax : k - bias);
if (digit < t) {
break;
}
baseMinusT = base - t;
if (w > floor(maxInt / baseMinusT)) {
error('overflow');
}
w *= baseMinusT;
}
out = output.length + 1;
bias = adapt(i - oldi, out, oldi == 0);
// `i` was supposed to wrap around from `out` to `0`,
// incrementing `n` each time, so we'll fix that now:
if (floor(i / out) > maxInt - n) {
error('overflow');
}
n += floor(i / out);
i %= out;
// Insert `n` at position `i` of the output
output.splice(i++, 0, n);
}
return ucs2encode(output);
}
/**
* Converts a string of Unicode symbols (e.g. a domain name label) to a
* Punycode string of ASCII-only symbols.
* @memberOf punycode
* @param {String} input The string of Unicode symbols.
* @returns {String} The resulting Punycode string of ASCII-only symbols.
*/
function encode(input) {
var n,
delta,
handledCPCount,
basicLength,
bias,
j,
m,
q,
k,
t,
currentValue,
output = [],
/** `inputLength` will hold the number of code points in `input`. */
inputLength,
/** Cached calculation results */
handledCPCountPlusOne,
baseMinusT,
qMinusT;
// Convert the input in UCS-2 to Unicode
input = ucs2decode(input);
// Cache the length
inputLength = input.length;
// Initialize the state
n = initialN;
delta = 0;
bias = initialBias;
// Handle the basic code points
for (j = 0; j < inputLength; ++j) {
currentValue = input[j];
if (currentValue < 0x80) {
output.push(stringFromCharCode(currentValue));
}
}
handledCPCount = basicLength = output.length;
// `handledCPCount` is the number of code points that have been handled;
// `basicLength` is the number of basic code points.
// Finish the basic string - if it is not empty - with a delimiter
if (basicLength) {
output.push(delimiter);
}
// Main encoding loop:
while (handledCPCount < inputLength) {
// All non-basic code points < n have been handled already. Find the next
// larger one:
for (m = maxInt, j = 0; j < inputLength; ++j) {
currentValue = input[j];
if (currentValue >= n && currentValue < m) {
m = currentValue;
}
}
// Increase `delta` enough to advance the decoder's <n,i> state to <m,0>,
// but guard against overflow
handledCPCountPlusOne = handledCPCount + 1;
if (m - n > floor((maxInt - delta) / handledCPCountPlusOne)) {
error('overflow');
}
delta += (m - n) * handledCPCountPlusOne;
n = m;
for (j = 0; j < inputLength; ++j) {
currentValue = input[j];
if (currentValue < n && ++delta > maxInt) {
error('overflow');
}
if (currentValue == n) {
// Represent delta as a generalized variable-length integer
for (q = delta, k = base; /* no condition */; k += base) {
t = k <= bias ? tMin : (k >= bias + tMax ? tMax : k - bias);
if (q < t) {
break;
}
qMinusT = q - t;
baseMinusT = base - t;
output.push(
stringFromCharCode(digitToBasic(t + qMinusT % baseMinusT, 0))
);
q = floor(qMinusT / baseMinusT);
}
output.push(stringFromCharCode(digitToBasic(q, 0)));
bias = adapt(delta, handledCPCountPlusOne, handledCPCount == basicLength);
delta = 0;
++handledCPCount;
}
}
++delta;
++n;
}
return output.join('');
}
/**
* Converts a Punycode string representing a domain name or an email address
* to Unicode. Only the Punycoded parts of the input will be converted, i.e.
* it doesn't matter if you call it on a string that has already been
* converted to Unicode.
* @memberOf punycode
* @param {String} input The Punycoded domain name or email address to
* convert to Unicode.
* @returns {String} The Unicode representation of the given Punycode
* string.
*/
function toUnicode(input) {
return mapDomain(input, function(string) {
return regexPunycode.test(string)
? decode(string.slice(4).toLowerCase())
: string;
});
}
/**
* Converts a Unicode string representing a domain name or an email address to
* Punycode. Only the non-ASCII parts of the domain name will be converted,
* i.e. it doesn't matter if you call it with a domain that's already in
* ASCII.
* @memberOf punycode
* @param {String} input The domain name or email address to convert, as a
* Unicode string.
* @returns {String} The Punycode representation of the given domain name or
* email address.
*/
function toASCII(input) {
return mapDomain(input, function(string) {
return regexNonASCII.test(string)
? 'xn--' + encode(string)
: string;
});
}
/*--------------------------------------------------------------------------*/
/** Define the public API */
punycode = {
/**
* A string representing the current Punycode.js version number.
* @memberOf punycode
* @type String
*/
'version': '1.3.1',
/**
* An object of methods to convert from JavaScript's internal character
* representation (UCS-2) to Unicode code points, and back.
* @see <https://mathiasbynens.be/notes/javascript-encoding>
* @memberOf punycode
* @type Object
*/
'ucs2': {
'decode': ucs2decode,
'encode': ucs2encode
},
'decode': decode,
'encode': encode,
'toASCII': toASCII,
'toUnicode': toUnicode
};
/** Expose `punycode` */
// Some AMD build optimizers, like r.js, check for specific condition patterns
// like the following:
if (
typeof define == 'function' &&
typeof define.amd == 'object' &&
define.amd
) {
define('punycode', function() {
return punycode;
});
} else if (freeExports && freeModule) {
if (module.exports == freeExports) { // in Node.js or RingoJS v0.8.0+
freeModule.exports = punycode;
} else { // in Narwhal or RingoJS v0.7.0-
for (key in punycode) {
punycode.hasOwnProperty(key) && (freeExports[key] = punycode[key]);
}
}
} else { // in Rhino or a web browser
root.punycode = punycode;
}
}(this));
var html2canvasNodeAttribute = "data-html2canvas-node";
window.html2canvas = function(nodeList, options) {
@ -61,6 +590,8 @@ window.html2canvas = function(nodeList, options) {
});
};
window.html2canvas.punycode = this.punycode;
function renderDocument(document, options, windowWidth, windowHeight) {
return createWindowClone(document, windowWidth, windowHeight, options).then(function(container) {
log("Document cloned");
@ -262,6 +793,8 @@ GradientContainer.prototype.TYPES = {
RADIAL: 2
};
GradientContainer.prototype.angleRegExp = /([+-]?\d*\.?\d+)(deg|grad|rad|turn)/;
function ImageContainer(src, cors) {
this.src = src;
this.image = new Image();
@ -445,6 +978,32 @@ function LinearGradientContainer(imageData) {
this.x1 = x0;
this.y1 = y0;
break;
default:
var angle = position.match(this.angleRegExp);
if (angle) {
switch(angle[2]) {
case "deg":
var angleDeg = parseFloat(angle[1]);
var radians = angleDeg / (180 / Math.PI);
var slope = Math.tan(radians); // m
var perpendicularSlope = -1 / slope;
// y = 2
// y = m * x
// 2 = m * x
this.y0 = 2 / Math.tan(slope) / 2; // 1
// console.log(radians, angle);
this.x0 = 0;
this.x1 = 1;
this.y1 = 0;
break;
}
}
}
}, this);
} else {
@ -1165,7 +1724,11 @@ NodeParser.prototype.paintFormValue = function(container) {
NodeParser.prototype.paintText = function(container) {
container.applyTextTransform();
var textList = container.node.data.split(!this.options.letterRendering || noLetterSpacing(container) ? /(\b| )/ : "");
var characters = window.html2canvas.punycode.ucs2.decode(container.node.data);
var textList = (!this.options.letterRendering || noLetterSpacing(container)) && !hasUnicode(container.node.data) ? getWords(characters) : characters.map(function(character) {
return window.html2canvas.punycode.ucs2.encode([character]);
});
var weight = container.parent.fontWeight();
var size = container.parent.css('fontSize');
var family = container.parent.css('fontFamily');
@ -1538,6 +2101,44 @@ function stripQuotes(content) {
return (first === content.substr(content.length - 1) && first.match(/'|"/)) ? content.substr(1, content.length - 2) : content;
}
function getWords(characters) {
var words = [], i = 0, onWordBoundary = false, word;
while(characters.length) {
if (isWordBoundary(characters[i]) === onWordBoundary) {
word = characters.splice(0, i);
if (word.length) {
words.push(window.html2canvas.punycode.ucs2.encode(word));
}
onWordBoundary =! onWordBoundary;
i = 0;
} else {
i++;
}
if (i >= characters.length) {
word = characters.splice(0, i);
if (word.length) {
words.push(window.html2canvas.punycode.ucs2.encode(word));
}
}
}
return words;
}
function isWordBoundary(characterCode) {
return [
32, // <space>
13, // \r
10, // \n
9, // \t
45 // -
].indexOf(characterCode) !== -1;
}
function hasUnicode(string) {
return /[^\u0000-\u00ff]/.test(string);
}
function ProxyImageContainer(src, proxy) {
var callbackName = "html2canvas_" + proxyImageCount++;
var script = document.createElement("script");

File diff suppressed because one or more lines are too long

View File

@ -17699,4 +17699,4 @@ fabric.Image.filters.BaseFilter = fabric.util.createClass(/** @lends fabric.Imag
})(typeof exports !== 'undefined' ? exports : this);
})(window, document, html2canvas);
}).call({}, window, document, html2canvas);

File diff suppressed because one or more lines are too long

View File

@ -22,6 +22,8 @@ window.html2canvas = function(nodeList, options) {
});
};
window.html2canvas.punycode = this.punycode;
function renderDocument(document, options, windowWidth, windowHeight) {
return createWindowClone(document, windowWidth, windowHeight, options).then(function(container) {
log("Document cloned");

View File

@ -304,7 +304,11 @@ NodeParser.prototype.paintFormValue = function(container) {
NodeParser.prototype.paintText = function(container) {
container.applyTextTransform();
var textList = container.node.data.split(!this.options.letterRendering || noLetterSpacing(container) ? /(\b| )/ : "");
var characters = window.html2canvas.punycode.ucs2.decode(container.node.data);
var textList = (!this.options.letterRendering || noLetterSpacing(container)) && !hasUnicode(container.node.data) ? getWords(characters) : characters.map(function(character) {
return window.html2canvas.punycode.ucs2.encode([character]);
});
var weight = container.parent.fontWeight();
var size = container.parent.css('fontSize');
var family = container.parent.css('fontFamily');
@ -676,3 +680,41 @@ function stripQuotes(content) {
var first = content.substr(0, 1);
return (first === content.substr(content.length - 1) && first.match(/'|"/)) ? content.substr(1, content.length - 2) : content;
}
function getWords(characters) {
var words = [], i = 0, onWordBoundary = false, word;
while(characters.length) {
if (isWordBoundary(characters[i]) === onWordBoundary) {
word = characters.splice(0, i);
if (word.length) {
words.push(window.html2canvas.punycode.ucs2.encode(word));
}
onWordBoundary =! onWordBoundary;
i = 0;
} else {
i++;
}
if (i >= characters.length) {
word = characters.splice(0, i);
if (word.length) {
words.push(window.html2canvas.punycode.ucs2.encode(word));
}
}
}
return words;
}
function isWordBoundary(characterCode) {
return [
32, // <space>
13, // \r
10, // \n
9, // \t
45 // -
].indexOf(characterCode) !== -1;
}
function hasUnicode(string) {
return /[^\u0000-\u00ff]/.test(string);
}

1
src/punycode Submodule

@ -0,0 +1 @@
Subproject commit 5c04e6fe9560bac8d38a0bcdf2d2c8581f5fc330

View File

@ -4,11 +4,6 @@
<title>Chinese text</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<script type="text/javascript" src="../../test.js"></script>
<script>
h2cOptions = {
chinese: true
}
</script>
<style>
.chn-text-block {
width: 500px;
@ -38,4 +33,4 @@
</p><p>&nbsp;&nbsp;&nbsp;&nbsp;14 见鲁迅《集外集·自嘲》《鲁迅全集》第7卷人民文学出版社1981年版第147页</p>
</div>
</body>
</html>
</html>

View File

@ -11,8 +11,8 @@ var h2cSelector, h2cOptions;
document.write('<script type="text/javascript" src="' + src + '.js?' + Math.random() + '"></script>');
}
var sources = ['log', 'nodecontainer', 'stackingcontext', 'textcontainer', 'support', 'imagecontainer', 'dummyimagecontainer', 'proxyimagecontainer', 'gradientcontainer',
'lineargradientcontainer', 'webkitgradientcontainer', 'svgcontainer', 'svgnodecontainer', 'imageloader', 'nodeparser', 'font', 'fontmetrics', 'core', 'renderer', 'promise', 'xhr', 'renderers/canvas'];
var sources = ['log', 'punycode/punycode', 'core', 'nodecontainer', 'stackingcontext', 'textcontainer', 'support', 'imagecontainer', 'dummyimagecontainer', 'proxyimagecontainer', 'gradientcontainer',
'lineargradientcontainer', 'webkitgradientcontainer', 'svgcontainer', 'svgnodecontainer', 'imageloader', 'nodeparser', 'font', 'fontmetrics', 'renderer', 'promise', 'xhr', 'renderers/canvas'];
['/tests/assets/jquery-1.6.2'].concat(window.location.search === "?selenium" ? ['/dist/html2canvas'] : sources.map(function(src) { return '/src/' + src; })).forEach(appendScript);