Attachment #393691 for bug #208998

View Details Raw Unified Return to Bug 208998

| Differences between



2020-03-16  Keith Miller  <keith_miller@apple.com>

        JavaScript identifier grammar supports unescaped astral symbols, but JSC doesn’t
        https://bugs.webkit.org/show_bug.cgi?id=208998

        Reviewed by NOBODY (OOPS!).

        This patch fixes a bug in the parser that allows for surrogate pairs when parsing identifiers.
        It also makes a few other changes to the parser:

        1) When looking for keywords we just need to check that the subsequent
        character cannot be an identifier part or an escape start.

        2) The only time we call parseIdentifierSlowCase is when we hit an
        escape start or a surrogate pair, so we can optimize that to just
        copy everything up to the slow character into our buffer.

        3) We shouldn't allow for asking if a UChar is an identifier start/part.

        * KeywordLookupGenerator.py:
        (Trie.printSubTreeAsC):
        (Trie.printAsC):
        * parser/Lexer.cpp:
        (JSC::isNonLatin1IdentStart):
        (JSC::isIdentStart):
        (JSC::isSingleCharacterIdentStart):
        (JSC::cannotBeIdentStart):
        (JSC::isIdentPart):
        (JSC::isSingleCharacterIdentPart):
        (JSC::cannotBeIdentPartOrEscapeStart):
        (JSC::Lexer<LChar>::currentCodePoint const):
        (JSC::Lexer<UChar>::currentCodePoint const):
        (JSC::Lexer<LChar>::parseIdentifier):
        (JSC::Lexer<UChar>::parseIdentifier):
        (JSC::Lexer<CharacterType>::parseIdentifierSlowCase):
        (JSC::Lexer<T>::lexWithoutClearingLineTerminator):
        (JSC::Lexer<T>::scanRegExp):
        (JSC::isIdentPartIncludingEscapeTemplate): Deleted.
        (JSC::isIdentPartIncludingEscape): Deleted.
        * parser/Lexer.h:
        (JSC::Lexer::setOffsetFromSourcePtr): Deleted.
        * parser/Parser.cpp:
        (JSC::Parser<LexerType>::printUnexpectedTokenText):
        * parser/ParserTokens.h:

2020-03-13  Alexey Shvayka  <shvaikalesh@gmail.com>

        Bound functions should pass correct NewTarget value



2020-03-16  Keith Miller  <keith_miller@apple.com>

        JavaScript identifier grammar supports unescaped astral symbols, but JSC doesn’t
        https://bugs.webkit.org/show_bug.cgi?id=208998

        Reviewed by NOBODY (OOPS!).

        * wtf/text/WTFString.cpp:
        (WTF::String::fromCodePoint):
        * wtf/text/WTFString.h:

2020-03-11  Jer Noble  <jer.noble@apple.com>

        Adopt AVSampleBufferVideoOutput



        str = makePadding(indent)

        if self.value != None:
            print(str + "if (LIKELY(cannotBeIdentPartOrEscapeStart(code[%d]))) {" % (len(self.fullPrefix)))
            print(str + "    internalShift<%d>();" % len(self.fullPrefix))
            print(str + "    if (shouldCreateIdentifier)")
            print(str + ("        data->ident = &m_vm.propertyNames->%sKeyword;" % self.fullPrefix))

    def printAsC(self):
        print("namespace JSC {")
        print("")
        print("static ALWAYS_INLINE bool cannotBeIdentPartOrEscapeStart(LChar);")
        print("static ALWAYS_INLINE bool cannotBeIdentPartOrEscapeStart(UChar);")
        # max length + 1 so we don't need to do any bounds checking at all
        print("static constexpr int maxTokenLength = %d;" % (self.maxLength() + 1))
        print("")



        shift();
}

// Returns whether ICU reports the ID_Start binary property for code point |c|.
// isIdentStart() routes here for characters outside Latin-1; Latin-1 characters
// are classified through the typesOfLatin1Characters table instead.
static bool isNonLatin1IdentStart(UChar32 c)
{
    const bool hasIdStartProperty = u_hasBinaryProperty(c, UCHAR_ID_START);
    return hasIdStartProperty;
}

// Returns whether |c| may begin an identifier.
// Only instantiable for LChar and UChar32: a bare UChar could be half of a
// surrogate pair, so UChar callers must go through isSingleCharacterIdentStart().
template<typename CharacterType>
static ALWAYS_INLINE bool isIdentStart(CharacterType c)
{
    static_assert(std::is_same_v<CharacterType, LChar> || std::is_same_v<CharacterType, UChar32>, "Call isSingleCharacterIdentStart for UChars that don't need to check for surrogate pairs");
    if (isLatin1(c)) {
        // Latin-1 characters are classified with a table lookup.
        const auto latin1Type = typesOfLatin1Characters[static_cast<LChar>(c)];
        return latin1Type == CharacterIdentifierStart;
    }
    // Everything else is answered by the Unicode property query.
    return isNonLatin1IdentStart(c);
}

// Returns whether the single UTF-16 code unit |c|, taken on its own, may begin
// an identifier. Surrogate code units always yield false here; this function
// never looks at a neighbouring code unit.
static ALWAYS_INLINE bool isSingleCharacterIdentStart(UChar c)
{
    if (LIKELY(isLatin1(c)))
        return isIdentStart(static_cast<LChar>(c));
    if (U16_IS_SURROGATE(c))
        return false;
    return isIdentStart(static_cast<UChar32>(c));
}

// Returns true when the Latin-1 character |c| can neither begin an identifier
// directly nor introduce an escape sequence (a backslash).
static ALWAYS_INLINE bool cannotBeIdentStart(LChar c)
{
    if (c == '\\')
        return false;
    return !isIdentStart(c);
}

// UTF-16 overload. Latin-1 code units defer to the LChar overload. For any
// other code unit this only answers true for white space and line terminators,
// so a false result does not by itself mean the character starts an identifier.
static ALWAYS_INLINE bool cannotBeIdentStart(UChar c)
{
    if (LIKELY(isLatin1(c)))
        return cannotBeIdentStart(static_cast<LChar>(c));
    if (Lexer<UChar>::isWhiteSpace(c))
        return true;
    return Lexer<UChar>::isLineTerminator(c);
}

// Returns whether code point |c| may continue an identifier: either ICU reports
// the ID_Continue binary property for it, or it is U+200C (ZERO WIDTH
// NON-JOINER) or U+200D (ZERO WIDTH JOINER).
static NEVER_INLINE bool isNonLatin1IdentPart(UChar32 c)
{
    return u_hasBinaryProperty(c, UCHAR_ID_CONTINUE) || c == 0x200C || c == 0x200D;
}

// Returns whether |c| may appear inside an identifier after its first character.
// Only instantiable for LChar and UChar32: a bare UChar could be half of a
// surrogate pair, so UChar callers must go through isSingleCharacterIdentPart().
template<typename CharacterType>
static ALWAYS_INLINE bool isIdentPart(CharacterType c)
{
    static_assert(std::is_same_v<CharacterType, LChar> || std::is_same_v<CharacterType, UChar32>, "Call isSingleCharacterIdentPart for UChars that don't need to check for surrogate pairs");
    if (isLatin1(c)) {
        // The Latin-1 character classes are ordered so that every class that may be part of
        // an identifier has a value no greater than CharacterOtherIdentifierPart, which makes
        // a single comparison sufficient. (See the character type enum for more details.)
        const auto latin1Type = typesOfLatin1Characters[static_cast<LChar>(c)];
        return latin1Type <= CharacterOtherIdentifierPart;
    }
    return isNonLatin1IdentPart(c);
}

// Returns whether the single UTF-16 code unit |c|, taken on its own, may
// continue an identifier. Surrogate code units always yield false here; this
// function never looks at a neighbouring code unit.
static ALWAYS_INLINE bool isSingleCharacterIdentPart(UChar c)
{
    if (LIKELY(isLatin1(c)))
        return isIdentPart(static_cast<LChar>(c));
    if (U16_IS_SURROGATE(c))
        return false;
    return isIdentPart(static_cast<UChar32>(c));
}

// Returns true when the Latin-1 character |c| can neither continue an
// identifier nor introduce an escape sequence (a backslash).
static ALWAYS_INLINE bool cannotBeIdentPartOrEscapeStart(LChar c)
{
    if (c == '\\')
        return false;
    return !isIdentPart(c);
}

// NOTE: This may give false negatives (for non-ASCII) but won't give false positives.
// This means it can be used to detect the end of a keyword (all keywords are ASCII).
static ALWAYS_INLINE bool cannotBeIdentPartOrEscapeStart(UChar c)
{
    if (LIKELY(isLatin1(c)))
        return cannotBeIdentPartOrEscapeStart(static_cast<LChar>(c));
    return Lexer<UChar>::isWhiteSpace(c) || Lexer<UChar>::isLineTerminator(c);
    // Shortest sequence handled below is \u{0}, which is 5 characters.
    if (!(code[0] == '\\' && codeEnd - code >= 5 && code[1] == 'u'))
        return false;

    if (code[2] == '{') {
        UChar32 codePoint = 0;
        const CharacterType* pointer;
        for (pointer = &code[3]; pointer < codeEnd; ++pointer) {
            auto digit = *pointer;
            if (!isASCIIHexDigit(digit))
                break;
            codePoint = (codePoint << 4) | toASCIIHexValue(digit);
            if (codePoint > UCHAR_MAX_VALUE)
                return false;
        }
        return isIdentPart(codePoint) && pointer < codeEnd && *pointer == '}';
    }

    // Shortest sequence handled below is \uXXXX, which is 6 characters.
    if (codeEnd - code < 6)
        return false;

    auto character1 = code[2];
    auto character2 = code[3];
    auto character3 = code[4];
    auto character4 = code[5];
    return isASCIIHexDigit(character1) && isASCIIHexDigit(character2) && isASCIIHexDigit(character3) && isASCIIHexDigit(character4)
        && isIdentPart(Lexer<LChar>::convertUnicode(character1, character2, character3, character4));
}


// 8-bit specialization: the current character is returned as the code point,
// with no surrogate handling.
template<>
ALWAYS_INLINE UChar32 Lexer<LChar>::currentCodePoint() const
{
    return static_cast<UChar32>(m_current);
}

// Decodes the code point that starts at the current UTF-16 code unit.
//  - A non-surrogate code unit is returned unchanged.
//  - A lead surrogate followed by a trail surrogate yields the combined
//    supplementary code point.
//  - Anything else (a lone trail surrogate, or a lead surrogate that is not
//    followed by a trail surrogate) yields U_SENTINEL.
template<>
ALWAYS_INLINE UChar32 Lexer<UChar>::currentCodePoint() const
{
    ASSERT_WITH_MESSAGE(!isIdentStart(static_cast<UChar32>(U_SENTINEL)), "error values shouldn't appear as a valid identifier start code point");
    if (!U16_IS_SURROGATE(m_current))
        return m_current;

    // The peeked unit is arbitrary input, not a known surrogate. U16_IS_SURROGATE_TRAIL
    // only tests bit 0x400 and is documented as valid solely for values that already
    // satisfy U16_IS_SURROGATE, so it would accept e.g. U+0400 as a "trail". Use the
    // full-range U16_IS_TRAIL check instead.
    UChar trail = peek(1);
    if (UNLIKELY(!U16_IS_LEAD(m_current) || !U16_IS_TRAIL(trail)))
        return U_SENTINEL;

    return U16_GET_SUPPLEMENTARY(m_current, trail);
}

template<typename CharacterType>

    }
    
    const LChar* identifierStart = currentSourcePtr();
    ASSERT(isIdentStart(m_current) || m_current == '\\');
    
    while (isIdentPart(m_current))
        shift();
    
    if (UNLIKELY(m_current == '\\'))
        return parseIdentifierSlowCase<shouldCreateIdentifier>(tokenData, lexerFlags, strictMode, identifierStart);
        return parseIdentifierSlowCase<shouldCreateIdentifier>(tokenData, lexerFlags, strictMode);
    }

    const Identifier* ident = nullptr;
    

template <>
template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType Lexer<UChar>::parseIdentifier(JSTokenData* tokenData, OptionSet<LexerFlags> lexerFlags, bool strictMode)
{
    ASSERT(!m_parsingBuiltinFunction);
    tokenData->escaped = false;
    const ptrdiff_t remaining = m_codeEnd - m_code;
    if ((remaining >= maxTokenLength) && !lexerFlags.contains(LexerFlags::IgnoreReservedWords)) {

            return keyword == RESERVED_IF_STRICT && !strictMode ? IDENT : keyword;
        }
    }
    
    bool isPrivateName = m_current == '@' && m_parsingBuiltinFunction;
    bool isWellKnownSymbol = false;
    if (isPrivateName) {
        ASSERT(m_parsingBuiltinFunction);
        shift();
        if (m_current == '@') {
            isWellKnownSymbol = true;
            shift();
        }
    }


    const UChar* identifierStart = currentSourcePtr();
    int identifierLineStart = currentLineStartOffset();

    UChar orAllChars = 0;
    ASSERT(isSingleCharacterIdentStart(m_current) || U16_IS_SURROGATE(m_current) || m_current == '\\');
    while (isSingleCharacterIdentPart(m_current)) {
        orAllChars |= m_current;
        shift();
    }
    
    if (UNLIKELY(U16_IS_SURROGATE(m_current) || m_current == '\\'))
        return parseIdentifierSlowCase<shouldCreateIdentifier>(tokenData, lexerFlags, strictMode, identifierStart);
        setOffsetFromSourcePtr(identifierStart, identifierLineStart);
        return parseIdentifierSlowCase<shouldCreateIdentifier>(tokenData, lexerFlags, strictMode);
    }

    bool isAll8Bit = false;

    if (!(orAllChars & ~0xff))
        isAll8Bit = true;

    bool isAll8Bit = !(orAllChars & ~0xff);
    const Identifier* ident = nullptr;
    
    if (shouldCreateIdentifier) {
        int identifierLength = currentSourcePtr() - identifierStart;
        if (isAll8Bit)
            ident = makeIdentifierLCharFromUChar(identifierStart, identifierLength);
        else
            ident = makeIdentifier(identifierStart, identifierLength);
                ident = &m_arena->makeIdentifier(m_vm, m_vm.propertyNames->builtinNames().lookUpPrivateName(identifierStart, identifierLength));
            if (!ident)
                return INVALID_PRIVATE_NAME_ERRORTOK;
        } else {
            if (isAll8Bit)
                ident = makeIdentifierLCharFromUChar(identifierStart, identifierLength);
            else
                ident = makeIdentifier(identifierStart, identifierLength);
            if (m_parsingBuiltinFunction) {
                if (!isSafeBuiltinIdentifier(m_vm, ident)) {
                    m_lexErrorMessage = makeString("The use of '", ident->string(), "' is disallowed in builtin functions.");
                    return ERRORTOK;
                }
                if (*ident == m_vm.propertyNames->undefinedKeyword)
                    tokenData->ident = &m_vm.propertyNames->undefinedPrivateName;
            }
        }
        tokenData->ident = ident;
    } else
        tokenData->ident = nullptr;
    
    if (UNLIKELY((remaining < maxTokenLength) && !lexerFlags.contains(LexerFlags::IgnoreReservedWords))) {
        ASSERT(shouldCreateIdentifier);
        if (remaining < maxTokenLength) {
            const HashTableValue* entry = JSC::mainTable.entry(*ident);

    return IDENT;
}

template<typename CharacterType>
template<bool shouldCreateIdentifier>
JSTokenType Lexer<CharacterType>::parseIdentifierSlowCase(JSTokenData* tokenData, OptionSet<LexerFlags> lexerFlags, bool strictMode, const CharacterType* identifierStart)
{
    ASSERT(U16_IS_SURROGATE(m_current) || m_current == '\\');
    ASSERT(m_buffer16.isEmpty());
    ASSERT(!tokenData->escaped);

    auto fillBuffer = [&] (bool isStart = false) {
        // \uXXXX unicode characters or Surrogate pairs.
        if (identifierStart != currentSourcePtr())
            m_buffer16.append(identifierStart, currentSourcePtr() - identifierStart);

        if (m_current == '\\') {
            tokenData->escaped = true;
            shift();
            if (UNLIKELY(m_current != 'u'))
                return atEnd() ? UNTERMINATED_IDENTIFIER_ESCAPE_ERRORTOK : INVALID_IDENTIFIER_ESCAPE_ERRORTOK;
            shift();
            auto character = parseUnicodeEscape();
            if (UNLIKELY(!character.isValid()))
                return character.isIncomplete() ? UNTERMINATED_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK : INVALID_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK;
            if (UNLIKELY(isStart ? !isIdentStart(character.value()) : !isIdentPart(character.value())))
                return INVALID_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK;
            if (shouldCreateIdentifier)
                recordUnicodeCodePoint(character.value());
            identifierStart = currentSourcePtr();
            return IDENT;
        }
        if (LIKELY(m_current != '\\'))
            break;

        ASSERT(U16_IS_SURROGATE(m_current));
        if (UNLIKELY(!U16_IS_SURROGATE_LEAD(m_current)))
            return INVALID_UNICODE_ENCODING_ERRORTOK;

        UChar32 codePoint = currentCodePoint();
        if (UNLIKELY(codePoint == U_SENTINEL))
            return INVALID_UNICODE_ENCODING_ERRORTOK;
        if (UNLIKELY(isStart ? !isNonLatin1IdentStart(codePoint) : !isNonLatin1IdentPart(codePoint)))
            return INVALID_IDENTIFIER_UNICODE_ERRORTOK;
        append16(m_code, 2);
        shift();
        if (UNLIKELY(m_current != 'u'))
            return atEnd() ? UNTERMINATED_IDENTIFIER_ESCAPE_ERRORTOK : INVALID_IDENTIFIER_ESCAPE_ERRORTOK;
        shift();
        auto character = parseUnicodeEscape();
        if (UNLIKELY(!character.isValid()))
            return character.isIncomplete() ? UNTERMINATED_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK : INVALID_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK;
        if (UNLIKELY(m_buffer16.size() ? !isIdentPart(character.value()) : !isIdentStart(character.value())))
            return INVALID_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK;
        if (shouldCreateIdentifier)
            recordUnicodeCodePoint(character.value());
        identifierStart = currentSourcePtr();
        return IDENT;
    };

    JSTokenType type = fillBuffer(identifierStart == currentSourcePtr());
    if (UNLIKELY(type & ErrorTokenFlag))
        return type;

    while (true) {
        if (LIKELY(isSingleCharacterIdentPart(m_current))) {
            shift();
            continue;
        }
        if (!U16_IS_SURROGATE(m_current) && m_current != '\\')
            break;

        type = fillBuffer();
        if (UNLIKELY(type & ErrorTokenFlag))
            return type;
    }

    int identifierLength;
    const Identifier* ident = nullptr;
    if (shouldCreateIdentifier) {
        if (identifierStart != currentSourcePtr())
            m_buffer16.append(identifierStart, currentSourcePtr() - identifierStart);
        ident = makeIdentifier(m_buffer16.data(), m_buffer16.size());
        } else {
            if (identifierStart != currentSourcePtr())
                m_buffer16.append(identifierStart, currentSourcePtr() - identifierStart);
            ident = makeIdentifier(m_buffer16.data(), m_buffer16.size());
        }

        tokenData->ident = ident;
    } else

            return IDENT;
        JSTokenType token = static_cast<JSTokenType>(entry->lexerValue());
        if ((token != RESERVED_IF_STRICT) || strictMode)
            return UNEXPECTED_ESCAPE_ERRORTOK;
    }

    return IDENT;

    CharacterType type;
    if (LIKELY(isLatin1(m_current)))
        type = static_cast<CharacterType>(typesOfLatin1Characters[m_current]);
    else {
        UChar32 codePoint;
        U16_GET(m_code, 0, 0, m_codeEnd - m_code, codePoint);
        if (isNonLatin1IdentStart(codePoint))
            type = CharacterIdentifierStart;
        else if (isLineTerminator(m_current))
            type = CharacterLineTerminator;
        else
            type = CharacterInvalid;
    }

    switch (type) {
    case CharacterGreater:

        if (token == INTEGER)
            token = tokenTypeForIntegerLikeToken(tokenData->doubleValue);

        if (LIKELY(cannotBeIdentStart(m_current))) {
            m_buffer8.shrink(0);
            break;
        }

        if (UNLIKELY(isIdentStart(currentCodePoint()))) {
            m_lexErrorMessage = "No identifiers allowed directly after numeric literal"_s;
            token = atEnd() ? UNTERMINATED_NUMERIC_LITERAL_ERRORTOK : INVALID_NUMERIC_LITERAL_ERRORTOK;
            goto returnError;

                tokenData->radix = 16;
            }

            if (LIKELY(cannotBeIdentStart(m_current))) {
                if (LIKELY(token != BIGINT))
                    token = tokenTypeForIntegerLikeToken(tokenData->doubleValue);
                m_buffer8.shrink(0);
                break;
            }

            if (UNLIKELY(isIdentStart(currentCodePoint()))) {
                m_lexErrorMessage = "No space between hexadecimal literal and identifier"_s;
                token = UNTERMINATED_HEX_NUMBER_ERRORTOK;
                goto returnError;

                tokenData->radix = 2;
            }

            if (LIKELY(cannotBeIdentStart(m_current))) {
                if (LIKELY(token != BIGINT))
                    token = tokenTypeForIntegerLikeToken(tokenData->doubleValue);
                m_buffer8.shrink(0);
                break;
            }

            if (UNLIKELY(isIdentStart(currentCodePoint()))) {
                m_lexErrorMessage = "No space between binary literal and identifier"_s;
                token = UNTERMINATED_BINARY_NUMBER_ERRORTOK;
                goto returnError;

                tokenData->radix = 8;
            }

            if (LIKELY(cannotBeIdentStart(m_current))) {
                if (LIKELY(token != BIGINT))
                    token = tokenTypeForIntegerLikeToken(tokenData->doubleValue);
                m_buffer8.shrink(0);
                break;
            }

            if (UNLIKELY(isIdentStart(currentCodePoint()))) {
                m_lexErrorMessage = "No space between octal literal and identifier"_s;
                token = UNTERMINATED_OCTAL_NUMBER_ERRORTOK;
                goto returnError;

            }
        }

        if (LIKELY(cannotBeIdentStart(m_current))) {
            m_buffer8.shrink(0);
            break;
        }

        if (UNLIKELY(isIdentStart(currentCodePoint()))) {
            m_lexErrorMessage = "No identifiers allowed directly after numeric literal"_s;
            token = atEnd() ? UNTERMINATED_NUMERIC_LITERAL_ERRORTOK : INVALID_NUMERIC_LITERAL_ERRORTOK;
            goto returnError;

        token = STRING;
        break;
        }
    case CharacterIdentifierStart: {
        if constexpr (ASSERT_ENABLED) {
            UChar32 codePoint;
            U16_GET(m_code, 0, 0, m_codeEnd - m_code, codePoint);
            ASSERT(isIdentStart(codePoint));
        }
        FALLTHROUGH;
    }
    case CharacterBackSlash:
        parseIdent:
        if (lexerFlags.contains(LexerFlags::DontBuildKeywords))

    }

    tokenData->pattern = makeRightSizedIdentifier(m_buffer16.data(), m_buffer16.size(), charactersOredTogether);

    m_buffer16.shrink(0);
    charactersOredTogether = 0;

    ASSERT(m_buffer8.isEmpty());
    while (LIKELY(isLatin1(m_current)) && isIdentPart(static_cast<LChar>(m_current))) {
        record8(static_cast<LChar>(m_current));
        shift();
    }

    // FIXME: This should probably not be a lex error but dealing with surrogate pairs here is annoying and it's going to be an error anyway...
    if (UNLIKELY(!isLatin1(m_current))) {
        m_buffer8.shrink(0);
        JSTokenType token = INVALID_IDENTIFIER_UNICODE_ERRORTOK;
        fillTokenInfo(tokenRecord, token, m_lineNumber, currentOffset(), currentLineStartOffset(), currentPosition());
        m_error = true;
        String codePoint = String::fromCodePoint(currentCodePoint());
        if (!codePoint)
            codePoint = "`invalid unicode character`";
        m_lexErrorMessage = makeString("Invalid non-latin character in RexExp literal's flags '", getToken(*tokenRecord), codePoint, "'");
        return token;
    }

    tokenData->flags = makeIdentifier(m_buffer8.data(), m_buffer8.size());
    m_buffer8.shrink(0);

    // Since RegExp always ends with /, m_atLineStart always becomes false.
    m_atLineStart = false;



    void append16(const LChar*, size_t);
    // Appends |length| UTF-16 code units starting at |characters| to m_buffer16.
    void append16(const UChar* characters, size_t length) { m_buffer16.append(characters, length); }

    UChar32 currentCodePoint() const;
    ALWAYS_INLINE void shift();
    ALWAYS_INLINE bool atEnd() const;
    ALWAYS_INLINE T peek(int offset) const;


    String invalidCharacterMessage() const;
    ALWAYS_INLINE const T* currentSourcePtr() const;
    ALWAYS_INLINE void setOffsetFromSourcePtr(const T* sourcePtr, unsigned lineStartOffset) { setOffset(offsetFromSourcePtr(sourcePtr), lineStartOffset); }

    ALWAYS_INLINE void setCodeStart(const StringView&);


    template <int shiftAmount> void internalShift();
    template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType parseKeyword(JSTokenData*);
    template <bool shouldBuildIdentifiers> ALWAYS_INLINE JSTokenType parseIdentifier(JSTokenData*, OptionSet<LexerFlags>, bool strictMode);
    template <bool shouldBuildIdentifiers> NEVER_INLINE JSTokenType parseIdentifierSlowCase(JSTokenData*, OptionSet<LexerFlags>, bool strictMode, const T* identifierStart);
    enum StringParseResult {
        StringParsedSuccessfully,
        StringUnterminated,

Lines 5220-5225 template <typename LexerType> void Parser<LexerType>::printUnexpectedTokenText(W a/Source/JavaScriptCore/parser/Parser.cpp_sec1
- a/Source/JavaScriptCore/parser/Parser.cpp +6 lines
5220	case INVALID_STRING_LITERAL_ERRORTOK:	5220	case INVALID_STRING_LITERAL_ERRORTOK:
5221	out.print("Invalid string literal: '", getToken(), "'");	5221	out.print("Invalid string literal: '", getToken(), "'");
5222	return;	5222	return;
		5223	case INVALID_UNICODE_ENCODING_ERRORTOK:
		5224	out.print("Invalid unicode encoding: '", getToken(), "'");
		5225	return;
		5226	case INVALID_IDENTIFIER_UNICODE_ERRORTOK:
		5227	out.print("Invalid unicode code point in identifier: '", getToken(), "'");
		5228	return;
5223	case ERRORTOK:	5229	case ERRORTOK:
5224	out.print("Unrecognized token '", getToken(), "'");	5230	out.print("Unrecognized token '", getToken(), "'");
5225	return;	5231	return;



class Identifier;

// Flag bits and shift amounts that are OR-ed into token type values.
enum {
    // Token Bitfield: 0b000000000RTE00IIIIPPPPKUXXXXXXXX
    // R = right-associative bit
    // T = unterminated error flag
    // E = error flag

    // I = 4 bits starting at BinaryOpTokenPrecedenceShift + BinaryOpTokenAllowsInPrecedenceAdditionalShift
    //     (presumably the binary operator precedence used when `in` is allowed - confirm against the full header)
    // P = binary operator precedence (BinaryOpTokenPrecedenceMask)
    // K = keyword flag
    // U = unary operator flag
    // X = low 8 bits (presumably the per-token index - confirm against the JSTokenType definitions)
    //
    // We must keep the upper 8bit (1byte) region empty. JSTokenType must be 24bits.
    UnaryOpTokenFlag = 1 << 8,
    KeywordTokenFlag = 1 << 9,
    BinaryOpTokenPrecedenceShift = 10,
    BinaryOpTokenAllowsInPrecedenceAdditionalShift = 4,
    BinaryOpTokenPrecedenceMask = 15 << BinaryOpTokenPrecedenceShift,
    // 4 + 10 + 6 == 20, i.e. the E bit in the layout above.
    ErrorTokenFlag = 1 << (BinaryOpTokenAllowsInPrecedenceAdditionalShift + BinaryOpTokenPrecedenceShift + 6),
    // Each of the following flags sits one bit above the previous one (bits 21 and 22).
    UnterminatedErrorTokenFlag = ErrorTokenFlag << 1,
    RightAssociativeBinaryOpTokenFlag = UnterminatedErrorTokenFlag << 1
};

    UNTERMINATED_REGEXP_LITERAL_ERRORTOK = 14 | ErrorTokenFlag | UnterminatedErrorTokenFlag,
    INVALID_TEMPLATE_LITERAL_ERRORTOK = 15 | ErrorTokenFlag,
    UNEXPECTED_ESCAPE_ERRORTOK = 16 | ErrorTokenFlag,
    INVALID_UNICODE_ENCODING_ERRORTOK = 17 | ErrorTokenFlag,
    INVALID_IDENTIFIER_UNICODE_ERRORTOK = 18 | ErrorTokenFlag,
};
static_assert(static_cast<unsigned>(POW) <= 0x00ffffffU, "JSTokenType must be 24bits.");


Lines 888-893 String String::fromUTF8WithLatin1Fallback(const LChar* string, size_t size) a/Source/WTF/wtf/text/WTFString.cpp_sec1
- a/Source/WTF/wtf/text/WTFString.cpp +9 lines
888	return utf8;	888	return utf8;
889	}	889	}
890		890
		891	String String::fromCodePoint(UChar32 codePoint)
		892	{
		893	UChar buffer[2];
		894	uint8_t length = 0;
		895	UBool error = false;
		896	U16_APPEND(buffer, length, 2, codePoint, error);
		897	return error ? String() : String(buffer, length);
		898	}
		899
891	// String Operations	900	// String Operations
892	template<typename CharacterType>	901	template<typename CharacterType>
893	static unsigned lengthOfCharactersAsInteger(const CharacterType* data, size_t length)	902	static unsigned lengthOfCharactersAsInteger(const CharacterType* data, size_t length)

Lines 356-361 public: a/Source/WTF/wtf/text/WTFString.h_sec1
- a/Source/WTF/wtf/text/WTFString.h +2 lines
356	WTF_EXPORT_PRIVATE static String fromUTF8WithLatin1Fallback(const LChar*, size_t);	356	WTF_EXPORT_PRIVATE static String fromUTF8WithLatin1Fallback(const LChar*, size_t);
357	static String fromUTF8WithLatin1Fallback(const char* characters, size_t length) { return fromUTF8WithLatin1Fallback(reinterpret_cast<const LChar*>(characters), length); };	357	static String fromUTF8WithLatin1Fallback(const char* characters, size_t length) { return fromUTF8WithLatin1Fallback(reinterpret_cast<const LChar*>(characters), length); };
358		358
		359	WTF_EXPORT_PRIVATE static String fromCodePoint(UChar32 codePoint);
		360
359	// Determines the writing direction using the Unicode Bidi Algorithm rules P2 and P3.	361	// Determines the writing direction using the Unicode Bidi Algorithm rules P2 and P3.
360	UCharDirection defaultWritingDirection(bool* hasStrongDirectionality = nullptr) const;	362	UCharDirection defaultWritingDirection(bool* hasStrongDirectionality = nullptr) const;
361		363



2020-03-16  Keith Miller  <keith_miller@apple.com>

        JavaScript identifier grammar supports unescaped astral symbols, but JSC doesn’t
        https://bugs.webkit.org/show_bug.cgi?id=208998

        Reviewed by NOBODY (OOPS!).

        * stress/unicode-identifiers-with-surrogate-pairs.js: Added.
        (let.c.of.chars.eval.foo):
        (throwsSyntaxError):
        (let.c.of.continueChars.throwsSyntaxError.foo):

2020-03-13  Alexey Shvayka  <shvaikalesh@gmail.com>

        Bound functions should pass correct NewTarget value




// Characters that are each used below as a complete identifier on their own.
// The second entry lies outside the Basic Multilingual Plane, so it is stored
// as a surrogate pair in UTF-16.
let chars = ["鴬", "𐊧", "Ϊ"];
// Characters that are used below both alone (expected to be a SyntaxError) and
// after a leading "a" (expected to parse). The second entry is built from the
// surrogate pair 0xDB40 0xDD96, which encodes U+E0196.
let continueChars =  [unescape("\u0311"), String.fromCharCode(...[0xDB40, 0xDD96])];

let o = { };
// Each eval must complete without throwing; an uncaught exception fails the test.
for (let c of chars) {
    // As a top-level variable name.
    eval(`var ${c};`);
    // As a variable name inside a function body.
    eval(`function foo() { var ${c} }`);
    // As a property name after a dot.
    eval(`o.${c}`);
}

// Asserts that evaluating `string` raises a SyntaxError.
//
// Returns undefined when eval(string) throws a SyntaxError. In every other case
// (the code ran to completion, or it threw some other kind of exception) throws
// an Error whose message is the offending source text.
//
// No extra local bindings are introduced here on purpose: `string` is run with a
// direct eval, so anything declared in this function is visible to that code.
function throwsSyntaxError(string) {
    try {
        eval(string);
    } catch (error) {
        // The one acceptable outcome.
        if (error instanceof SyntaxError)
            return;
    }
    // Reached when nothing was thrown, or when a non-SyntaxError was caught above.
    throw new Error(string);
}

// For each character in continueChars, check both halves of the expectation:
//  - used alone as the whole identifier (variable name, variable inside a function
//    body, property name after a dot) the source must fail with a SyntaxError;
//  - placed after "a" the same character must be accepted, so the plain evals
//    must complete without throwing.
for (let c of continueChars) {
    throwsSyntaxError(`var ${c}`);
    throwsSyntaxError(`function foo() { var ${c} }`);
    throwsSyntaxError(`o.${c}`);
    eval(`var ${("a" + c)}`);
    eval(`o.${"a" + c}`);

}

Return to Bug 208998

Lines 141-147 class Trie: a/Source/JavaScriptCore/KeywordLookupGenerator.py_sec1
- a/Source/JavaScriptCore/KeywordLookupGenerator.py -3 / +3 lines
141	str = makePadding(indent)	141	str = makePadding(indent)
142		142
143	if self.value != None:	143	if self.value != None:
144	print(str + "if (!isIdentPartIncludingEscape(code+%d, m_codeEnd)) {" % (len(self.fullPrefix)))	144	print(str + "if (LIKELY(cannotBeIdentPartOrEscapeStart(code[%d]))) {" % (len(self.fullPrefix)))
145	print(str + " internalShift<%d>();" % len(self.fullPrefix))	145	print(str + " internalShift<%d>();" % len(self.fullPrefix))
146	print(str + " if (shouldCreateIdentifier)")	146	print(str + " if (shouldCreateIdentifier)")
147	print(str + (" data->ident = &m_vm.propertyNames->%sKeyword;" % self.fullPrefix))	147	print(str + (" data->ident = &m_vm.propertyNames->%sKeyword;" % self.fullPrefix))
Lines 184-191 class Trie: a/Source/JavaScriptCore/KeywordLookupGenerator.py_sec2
184	def printAsC(self):	184	def printAsC(self):
185	print("namespace JSC {")	185	print("namespace JSC {")
186	print("")	186	print("")
187	print("static ALWAYS_INLINE bool isIdentPartIncludingEscape(const LChar* code, const LChar* codeEnd);")	187	print("static ALWAYS_INLINE bool cannotBeIdentPartOrEscapeStart(LChar);")
188	print("static ALWAYS_INLINE bool isIdentPartIncludingEscape(const UChar* code, const UChar* codeEnd);")	188	print("static ALWAYS_INLINE bool cannotBeIdentPartOrEscapeStart(UChar);")
189	# max length + 1 so we don't need to do any bounds checking at all	189	# max length + 1 so we don't need to do any bounds checking at all
190	print("static constexpr int maxTokenLength = %d;" % (self.maxLength() + 1))	190	print("static constexpr int maxTokenLength = %d;" % (self.maxLength() + 1))
191	print("")	191	print("")

Lines 135-140 private: a/Source/JavaScriptCore/parser/Lexer.h_sec1
- a/Source/JavaScriptCore/parser/Lexer.h -2 / +2 lines
135	void append16(const LChar*, size_t);	135	void append16(const LChar*, size_t);
136	void append16(const UChar* characters, size_t length) { m_buffer16.append(characters, length); }	136	void append16(const UChar* characters, size_t length) { m_buffer16.append(characters, length); }
137		137
		138	UChar32 currentCodePoint() const;
138	ALWAYS_INLINE void shift();	139	ALWAYS_INLINE void shift();
139	ALWAYS_INLINE bool atEnd() const;	140	ALWAYS_INLINE bool atEnd() const;
140	ALWAYS_INLINE T peek(int offset) const;	141	ALWAYS_INLINE T peek(int offset) const;
Lines 147-153 private: a/Source/JavaScriptCore/parser/Lexer.h_sec2
147		148
148	String invalidCharacterMessage() const;	149	String invalidCharacterMessage() const;
149	ALWAYS_INLINE const T* currentSourcePtr() const;	150	ALWAYS_INLINE const T* currentSourcePtr() const;
150	ALWAYS_INLINE void setOffsetFromSourcePtr(const T* sourcePtr, unsigned lineStartOffset) { setOffset(offsetFromSourcePtr(sourcePtr), lineStartOffset); }
151		151
152	ALWAYS_INLINE void setCodeStart(const StringView&);	152	ALWAYS_INLINE void setCodeStart(const StringView&);
153		153
Lines 166-172 private: a/Source/JavaScriptCore/parser/Lexer.h_sec3
166	template <int shiftAmount> void internalShift();	166	template <int shiftAmount> void internalShift();
167	template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType parseKeyword(JSTokenData*);	167	template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType parseKeyword(JSTokenData*);
168	template <bool shouldBuildIdentifiers> ALWAYS_INLINE JSTokenType parseIdentifier(JSTokenData*, OptionSet<LexerFlags>, bool strictMode);	168	template <bool shouldBuildIdentifiers> ALWAYS_INLINE JSTokenType parseIdentifier(JSTokenData*, OptionSet<LexerFlags>, bool strictMode);
169	template <bool shouldBuildIdentifiers> NEVER_INLINE JSTokenType parseIdentifierSlowCase(JSTokenData*, OptionSet<LexerFlags>, bool strictMode);	169	template <bool shouldBuildIdentifiers> NEVER_INLINE JSTokenType parseIdentifierSlowCase(JSTokenData*, OptionSet<LexerFlags>, bool strictMode, const T* identifierStart);
170	enum StringParseResult {	170	enum StringParseResult {
171	StringParsedSuccessfully,	171	StringParsedSuccessfully,
172	StringUnterminated,	172	StringUnterminated,

Lines 33-39 namespace JSC { a/Source/JavaScriptCore/parser/ParserTokens.h_sec1
- a/Source/JavaScriptCore/parser/ParserTokens.h -5 / +7 lines
33	class Identifier;	33	class Identifier;
34		34
35	enum {	35	enum {
36	// Token Bitfield: 0b000000000RTE000IIIIPPPPKUXXXXXXX	36	// Token Bitfield: 0b000000000RTE00IIIIPPPPKUXXXXXXXX
37	// R = right-associative bit	37	// R = right-associative bit
38	// T = unterminated error flag	38	// T = unterminated error flag
39	// E = error flag	39	// E = error flag
Lines 43-54 enum { a/Source/JavaScriptCore/parser/ParserTokens.h_sec2
43	// U = unary operator flag	43	// U = unary operator flag
44	//	44	//
45	// We must keep the upper 8bit (1byte) region empty. JSTokenType must be 24bits.	45	// We must keep the upper 8bit (1byte) region empty. JSTokenType must be 24bits.
46	UnaryOpTokenFlag = 128,	46	UnaryOpTokenFlag = 1 << 8,
47	KeywordTokenFlag = 256,	47	KeywordTokenFlag = 1 << 9,
48	BinaryOpTokenPrecedenceShift = 9,	48	BinaryOpTokenPrecedenceShift = 10,
49	BinaryOpTokenAllowsInPrecedenceAdditionalShift = 4,	49	BinaryOpTokenAllowsInPrecedenceAdditionalShift = 4,
50	BinaryOpTokenPrecedenceMask = 15 << BinaryOpTokenPrecedenceShift,	50	BinaryOpTokenPrecedenceMask = 15 << BinaryOpTokenPrecedenceShift,
51	ErrorTokenFlag = 1 << (BinaryOpTokenAllowsInPrecedenceAdditionalShift + BinaryOpTokenPrecedenceShift + 7),	51	ErrorTokenFlag = 1 << (BinaryOpTokenAllowsInPrecedenceAdditionalShift + BinaryOpTokenPrecedenceShift + 6),
52	UnterminatedErrorTokenFlag = ErrorTokenFlag << 1,	52	UnterminatedErrorTokenFlag = ErrorTokenFlag << 1,
53	RightAssociativeBinaryOpTokenFlag = UnterminatedErrorTokenFlag << 1	53	RightAssociativeBinaryOpTokenFlag = UnterminatedErrorTokenFlag << 1
54	};	54	};
Lines 192-197 enum JSTokenType { a/Source/JavaScriptCore/parser/ParserTokens.h_sec3
192	UNTERMINATED_REGEXP_LITERAL_ERRORTOK = 14 | ErrorTokenFlag | UnterminatedErrorTokenFlag,	192	UNTERMINATED_REGEXP_LITERAL_ERRORTOK = 14 | ErrorTokenFlag | UnterminatedErrorTokenFlag,
193	INVALID_TEMPLATE_LITERAL_ERRORTOK = 15 | ErrorTokenFlag,	193	INVALID_TEMPLATE_LITERAL_ERRORTOK = 15 | ErrorTokenFlag,
194	UNEXPECTED_ESCAPE_ERRORTOK = 16 | ErrorTokenFlag,	194	UNEXPECTED_ESCAPE_ERRORTOK = 16 | ErrorTokenFlag,
		195	INVALID_UNICODE_ENCODING_ERRORTOK = 17 | ErrorTokenFlag,
		196	INVALID_IDENTIFIER_UNICODE_ERRORTOK = 18 | ErrorTokenFlag,
195	};	197	};
196	static_assert(static_cast<unsigned>(POW) <= 0x00ffffffU, "JSTokenType must be 24bits.");	198	static_assert(static_cast<unsigned>(POW) <= 0x00ffffffU, "JSTokenType must be 24bits.");
197		199

Lines 1-3 a/Source/JavaScriptCore/ChangeLog_sec1
- a/Source/JavaScriptCore/ChangeLog +45 lines
		1	2020-03-16 Keith Miller <keith_miller@apple.com>
		2
		3	JavaScript identifier grammar supports unescaped astral symbols, but JSC doesn’t
		4	https://bugs.webkit.org/show_bug.cgi?id=208998
		5
		6	Reviewed by NOBODY (OOPS!).
		7
		8	This patch fixes a bug in the parser so that surrogate pairs are allowed when parsing identifiers.
		9	It also makes a few other changes to the parser:
		10
		11	1) When looking for keywords we just need to check that the subsequent
		12	character cannot be an identifier part or an escape start.
		13
		14	2) The only time we call parseIdentifierSlowCase is when we hit an
		15	escape start or a surrogate pair so we can optimize that to just
		16	copy everything up to the slow character into our buffer.
		17
		18	3) We shouldn't allow for asking if a UChar is an identifier start/part.
		19
		20	* KeywordLookupGenerator.py:
		21	(Trie.printSubTreeAsC):
		22	(Trie.printAsC):
		23	* parser/Lexer.cpp:
		24	(JSC::isNonLatin1IdentStart):
		25	(JSC::isIdentStart):
		26	(JSC::isSingleCharacterIdentStart):
		27	(JSC::cannotBeIdentStart):
		28	(JSC::isIdentPart):
		29	(JSC::isSingleCharacterIdentPart):
		30	(JSC::cannotBeIdentPartOrEscapeStart):
		31	(JSC::Lexer<LChar>::currentCodePoint const):
		32	(JSC::Lexer<UChar>::currentCodePoint const):
		33	(JSC::Lexer<LChar>::parseIdentifier):
		34	(JSC::Lexer<UChar>::parseIdentifier):
		35	(JSC::Lexer<CharacterType>::parseIdentifierSlowCase):
		36	(JSC::Lexer<T>::lexWithoutClearingLineTerminator):
		37	(JSC::Lexer<T>::scanRegExp):
		38	(JSC::isIdentPartIncludingEscapeTemplate): Deleted.
		39	(JSC::isIdentPartIncludingEscape): Deleted.
		40	* parser/Lexer.h:
		41	(JSC::Lexer::setOffsetFromSourcePtr): Deleted.
		42	* parser/Parser.cpp:
		43	(JSC::Parser<LexerType>::printUnexpectedTokenText):
		44	* parser/ParserTokens.h:
		45
1	2020-03-13 Alexey Shvayka <shvaikalesh@gmail.com>	46	2020-03-13 Alexey Shvayka <shvaikalesh@gmail.com>
2		47
3	Bound functions should pass correct NewTarget value	48	Bound functions should pass correct NewTarget value

Lines 1-3 a/Source/WTF/ChangeLog_sec1
- a/Source/WTF/ChangeLog +11 lines
		1	2020-03-16 Keith Miller <keith_miller@apple.com>
		2
		3	JavaScript identifier grammar supports unescaped astral symbols, but JSC doesn’t
		4	https://bugs.webkit.org/show_bug.cgi?id=208998
		5
		6	Reviewed by NOBODY (OOPS!).
		7
		8	* wtf/text/WTFString.cpp:
		9	(WTF::String::fromCodePoint):
		10	* wtf/text/WTFString.h:
		11
1	2020-03-11 Jer Noble <jer.noble@apple.com>	12	2020-03-11 Jer Noble <jer.noble@apple.com>
2		13
3	Adopt AVSampleBufferVideoOutput	14	Adopt AVSampleBufferVideoOutput

Lines 1-3 a/JSTests/ChangeLog_sec1
- a/JSTests/ChangeLog +12 lines
		1	2020-03-16 Keith Miller <keith_miller@apple.com>
		2
		3	JavaScript identifier grammar supports unescaped astral symbols, but JSC doesn’t
		4	https://bugs.webkit.org/show_bug.cgi?id=208998
		5
		6	Reviewed by NOBODY (OOPS!).
		7
		8	* stress/unicode-identifiers-with-surrogate-pairs.js: Added.
		9	(let.c.of.chars.eval.foo):
		10	(throwsSyntaxError):
		11	(let.c.of.continueChars.throwsSyntaxError.foo):
		12
1	2020-03-13 Alexey Shvayka <shvaikalesh@gmail.com>	13	2020-03-13 Alexey Shvayka <shvaikalesh@gmail.com>
2		14
3	Bound functions should pass correct NewTarget value	15	Bound functions should pass correct NewTarget value

Line 0 a/JSTests/stress/unicode-identifiers-with-surrogate-pairs.js_sec1
- a/JSTests/stress/unicode-identifiers-with-surrogate-pairs.js +30 lines
	1
	2	let chars = ["鴬", "𐊧", "Ϊ"];
	3	let continueChars = [unescape("\u0311"), String.fromCharCode(...[0xDB40, 0xDD96])];
	4
	5	let o = { };
	6	for (let c of chars) {
	7	eval(`var ${c};`);
	8	eval(`function foo() { var ${c} }`);
	9	eval(`o.${c}`);
	10	}
	11
	12	function throwsSyntaxError(string) {
	13	try {
	14	eval(string);
	15	} catch (e) {
	16	if (!(e instanceof SyntaxError))
	17	throw new Error(string);
	18	return;
	19	}
	20	throw new Error(string);
	21	}
	22
	23	for (let c of continueChars) {
	24	throwsSyntaxError(`var ${c}`);
	25	throwsSyntaxError(`function foo() { var ${c} }`);
	26	throwsSyntaxError(`o.${c}`);
	27	eval(`var ${("a" + c)}`);
	28	eval(`o.${"a" + c}`);
	29
	30	}