mirror of https://github.com/ghostfolio/ghostfolio
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
164 lines
6.9 KiB
164 lines
6.9 KiB
/******************************************************************************
|
|
* Copyright 2021 TypeFox GmbH
|
|
* This program and the accompanying materials are made available under the
|
|
* terms of the MIT License, which is available in the project root.
|
|
******************************************************************************/
|
|
|
|
import type { CustomPatternMatcherFunc, ILexingError, TokenPattern, TokenType, TokenVocabulary } from 'chevrotain';
|
|
import type { AbstractRule, Grammar, Keyword, TerminalRule } from '../languages/generated/ast.js';
|
|
import type { Stream } from '../utils/stream.js';
|
|
import { Lexer } from 'chevrotain';
|
|
import { isKeyword, isParserRule, isTerminalRule } from '../languages/generated/ast.js';
|
|
import { streamAllContents } from '../utils/ast-utils.js';
|
|
import { getAllReachableRules, terminalRegex } from '../utils/grammar-utils.js';
|
|
import { getCaseInsensitivePattern, isWhitespace, partialMatches } from '../utils/regexp-utils.js';
|
|
import { stream } from '../utils/stream.js';
|
|
|
|
export interface TokenBuilderOptions {
|
|
caseInsensitive?: boolean
|
|
}
|
|
|
|
export interface TokenBuilder {
|
|
buildTokens(grammar: Grammar, options?: TokenBuilderOptions): TokenVocabulary;
|
|
/**
|
|
* Produces a lexing report for the given text that was just tokenized using the tokens provided by this builder.
|
|
*
|
|
* @param text The text that was tokenized.
|
|
*/
|
|
flushLexingReport?(text: string): LexingReport;
|
|
}
|
|
|
|
/**
|
|
* A custom lexing report that can be produced by the token builder during the lexing process.
|
|
* Adopters need to ensure that the any custom fields are serializable so they can be sent across worker threads.
|
|
*/
|
|
export interface LexingReport {
|
|
diagnostics: LexingDiagnostic[];
|
|
}
|
|
|
|
export type LexingDiagnosticSeverity = 'error' | 'warning' | 'info' | 'hint';
|
|
|
|
export interface LexingDiagnostic extends ILexingError {
|
|
severity?: LexingDiagnosticSeverity;
|
|
}
|
|
|
|
export class DefaultTokenBuilder implements TokenBuilder {
|
|
/**
|
|
* The list of diagnostics stored during the lexing process of a single text.
|
|
*/
|
|
protected diagnostics: LexingDiagnostic[] = [];
|
|
|
|
buildTokens(grammar: Grammar, options?: TokenBuilderOptions): TokenVocabulary {
|
|
const reachableRules = stream(getAllReachableRules(grammar, false));
|
|
const terminalTokens: TokenType[] = this.buildTerminalTokens(reachableRules);
|
|
const tokens: TokenType[] = this.buildKeywordTokens(reachableRules, terminalTokens, options);
|
|
|
|
terminalTokens.forEach(terminalToken => {
|
|
const pattern = terminalToken.PATTERN;
|
|
if (typeof pattern === 'object' && pattern && 'test' in pattern && isWhitespace(pattern)) {
|
|
tokens.unshift(terminalToken);
|
|
} else {
|
|
tokens.push(terminalToken);
|
|
}
|
|
});
|
|
// We don't need to add the EOF token explicitly.
|
|
// It is automatically available at the end of the token stream.
|
|
return tokens;
|
|
}
|
|
|
|
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
flushLexingReport(text: string): LexingReport {
|
|
return { diagnostics: this.popDiagnostics() };
|
|
}
|
|
|
|
protected popDiagnostics(): LexingDiagnostic[] {
|
|
const diagnostics = [...this.diagnostics];
|
|
this.diagnostics = [];
|
|
return diagnostics;
|
|
}
|
|
|
|
protected buildTerminalTokens(rules: Stream<AbstractRule>): TokenType[] {
|
|
return rules.filter(isTerminalRule).filter(e => !e.fragment)
|
|
.map(terminal => this.buildTerminalToken(terminal)).toArray();
|
|
}
|
|
|
|
protected buildTerminalToken(terminal: TerminalRule): TokenType {
|
|
const regex = terminalRegex(terminal);
|
|
const pattern = this.requiresCustomPattern(regex) ? this.regexPatternFunction(regex) : regex;
|
|
const tokenType: TokenType = {
|
|
name: terminal.name,
|
|
PATTERN: pattern,
|
|
};
|
|
if (typeof pattern === 'function') {
|
|
tokenType.LINE_BREAKS = true;
|
|
}
|
|
if (terminal.hidden) {
|
|
// Only skip tokens that are able to accept whitespace
|
|
tokenType.GROUP = isWhitespace(regex) ? Lexer.SKIPPED : 'hidden';
|
|
}
|
|
return tokenType;
|
|
}
|
|
|
|
protected requiresCustomPattern(regex: RegExp): boolean {
|
|
if (regex.flags.includes('u') || regex.flags.includes('s')) {
|
|
// Unicode and dotall regexes are not supported by Chevrotain.
|
|
return true;
|
|
} else if (regex.source.includes('?<=') || regex.source.includes('?<!')) {
|
|
// Negative and positive lookbehind are not supported by Chevrotain yet.
|
|
return true;
|
|
} else {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
protected regexPatternFunction(regex: RegExp): CustomPatternMatcherFunc {
|
|
const stickyRegex = new RegExp(regex, regex.flags + 'y');
|
|
return (text, offset) => {
|
|
stickyRegex.lastIndex = offset;
|
|
const execResult = stickyRegex.exec(text);
|
|
return execResult;
|
|
};
|
|
}
|
|
|
|
protected buildKeywordTokens(rules: Stream<AbstractRule>, terminalTokens: TokenType[], options?: TokenBuilderOptions): TokenType[] {
|
|
return rules
|
|
// We filter by parser rules, since keywords in terminal rules get transformed into regex and are not actual tokens
|
|
.filter(isParserRule)
|
|
.flatMap(rule => streamAllContents(rule).filter(isKeyword))
|
|
.distinct(e => e.value).toArray()
|
|
// Sort keywords by descending length
|
|
.sort((a, b) => b.value.length - a.value.length)
|
|
.map(keyword => this.buildKeywordToken(keyword, terminalTokens, Boolean(options?.caseInsensitive)));
|
|
}
|
|
|
|
protected buildKeywordToken(keyword: Keyword, terminalTokens: TokenType[], caseInsensitive: boolean): TokenType {
|
|
const keywordPattern = this.buildKeywordPattern(keyword, caseInsensitive);
|
|
const tokenType: TokenType = {
|
|
name: keyword.value,
|
|
PATTERN: keywordPattern,
|
|
LONGER_ALT: this.findLongerAlt(keyword, terminalTokens)
|
|
};
|
|
|
|
if (typeof keywordPattern === 'function') {
|
|
tokenType.LINE_BREAKS = true;
|
|
}
|
|
|
|
return tokenType;
|
|
}
|
|
|
|
protected buildKeywordPattern(keyword: Keyword, caseInsensitive: boolean): TokenPattern {
|
|
return caseInsensitive ?
|
|
new RegExp(getCaseInsensitivePattern(keyword.value)) :
|
|
keyword.value;
|
|
}
|
|
|
|
protected findLongerAlt(keyword: Keyword, terminalTokens: TokenType[]): TokenType[] {
|
|
return terminalTokens.reduce((longerAlts: TokenType[], token) => {
|
|
const pattern = token?.PATTERN as RegExp;
|
|
if (pattern?.source && partialMatches('^' + pattern.source + '$', keyword.value)) {
|
|
longerAlts.push(token);
|
|
}
|
|
return longerAlts;
|
|
}, []);
|
|
}
|
|
}
|
|
|