Failsafe when splitting surrogate pairs - fixes #248

This commit is contained in:
Kevin Jahns 2020-10-31 02:05:33 +01:00
parent 114f28f48e
commit e9cb07da55
2 changed files with 49 additions and 2 deletions

View File

@ -51,6 +51,17 @@ export class ContentString {
splice (offset) {
const right = new ContentString(this.str.slice(offset))
this.str = this.str.slice(0, offset)
// Prevent encoding invalid documents because of splitting of surrogate pairs: https://github.com/yjs/yjs/issues/248
const firstCharCode = this.str.charCodeAt(offset - 1)
if (firstCharCode >= 0xD800 && firstCharCode <= 0xDBFF) {
// Last character of the left split is the start of a surrogate utf16/ucs2 pair.
// We don't support splitting of surrogate pairs because this may lead to invalid documents.
// Replace the invalid character with a unicode replacement character (<28> / U+FFFD)
this.str = this.str.slice(0, offset - 1) + '<27>'
// replace right as well
right.str = '<27>' + right.str.slice(1)
}
return right
}

View File

@ -249,6 +249,8 @@ export const testAppendChars = tc => {
t.assert(text0.length === N)
}
// Item count shared by the large-document tests: used as `N` in testBestCase
// and as `itemsToInsert` in testLargeFragmentedDocument.
const largeDocumentSize = 100000
// Shared fixtures: an ID created from (0, 0) and a string content holding 'a'.
// NOTE(review): their consumers are outside this excerpt — presumably the
// item-creation benchmarks below; confirm.
const id = Y.createID(0, 0)
const c = new Y.ContentString('a')
@ -256,7 +258,7 @@ const c = new Y.ContentString('a')
* @param {t.TestCase} tc
*/
export const testBestCase = tc => {
const N = 2000000
const N = largeDocumentSize
const items = new Array(N)
t.measureTime('time to create two million items in the best case', () => {
const parent = /** @type {any} */ ({})
@ -293,7 +295,7 @@ const tryGc = () => {
* @param {t.TestCase} tc
*/
export const testLargeFragmentedDocument = tc => {
const itemsToInsert = 1000000
const itemsToInsert = largeDocumentSize
let update = /** @type {any} */ (null)
;(() => {
const doc1 = new Y.Doc()
@ -321,6 +323,40 @@ export const testLargeFragmentedDocument = tc => {
})()
}
/**
 * Regression test: edits that cut through a surrogate pair must not result in
 * documents that fail to encode.
 *
 * https://github.com/yjs/yjs/issues/248
 *
 * @param {t.TestCase} tc
 */
export const testSplitSurrogateCharacter = tc => {
  // Every scenario receives a fresh text type and performs edits whose
  // boundaries fall between the two code units of a surrogate pair.
  /** @type {Array<function(any):void>} */
  const scenarios = [
    // Insert in the middle of a single surrogate pair; this should not lead to an encoding error.
    text => {
      text.insert(0, '👾')
      text.insert(1, 'hi!')
    },
    // Delete a range that covers only part of each surrogate pair.
    text => {
      text.insert(0, '👾👾')
      text.delete(1, 2)
    },
    // Formatting a range splits the surrogate pairs as well.
    text => {
      text.insert(0, '👾👾')
      text.format(1, 2, { bold: true })
    }
  ]
  scenarios.forEach(applyEdits => {
    const { users, text0 } = init(tc, { users: 2 })
    // Disconnecting forces the user to encode the split surrogate.
    users[1].disconnect()
    applyEdits(text0)
    compare(users)
  })
}
// RANDOM TESTS
let charCounter = 0