| 
									
										
										
										
											2018-10-17 13:16:45 +02:00
										 |  |  | // Copyright 2018 The Hugo Authors. All rights reserved.
 | 
					
						
							|  |  |  | //
 | 
					
						
							|  |  |  | // Licensed under the Apache License, Version 2.0 (the "License");
 | 
					
						
							|  |  |  | // you may not use this file except in compliance with the License.
 | 
					
						
							|  |  |  | // You may obtain a copy of the License at
 | 
					
						
							|  |  |  | // http://www.apache.org/licenses/LICENSE-2.0
 | 
					
						
							|  |  |  | //
 | 
					
						
							|  |  |  | // Unless required by applicable law or agreed to in writing, software
 | 
					
						
							|  |  |  | // distributed under the License is distributed on an "AS IS" BASIS,
 | 
					
						
							|  |  |  | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | 
					
						
							|  |  |  | // See the License for the specific language governing permissions and
 | 
					
						
							|  |  |  | // limitations under the License.
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | package pageparser
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-10-19 11:30:57 +02:00
										 |  |  | import (
 | 
					
						
							|  |  |  | 	"bytes"
 | 
					
						
							|  |  |  | 	"fmt"
 | 
					
						
							| 
									
										
										
										
											2019-09-29 14:51:51 +02:00
										 |  |  | 	"regexp"
 | 
					
						
							|  |  |  | 	"strconv"
 | 
					
						
							| 
									
										
										
										
											2022-05-28 13:18:50 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	"github.com/yuin/goldmark/util"
 | 
					
						
							| 
									
										
										
										
											2018-10-19 11:30:57 +02:00
										 |  |  | )
 | 
					
						
							| 
									
										
										
										
											2018-10-17 13:16:45 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-07-07 16:11:47 +02:00
										 |  |  | type lowHigh struct {
 | 
					
						
							|  |  |  | 	Low  int
 | 
					
						
							|  |  |  | 	High int
 | 
					
						
							|  |  |  | }
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-10-17 13:16:45 +02:00
										 |  |  | type Item struct {
 | 
					
						
							| 
									
										
										
										
											2022-07-07 16:11:47 +02:00
										 |  |  | 	Type ItemType
 | 
					
						
							|  |  |  | 	Err  error
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	// The common case is a single segment.
 | 
					
						
							|  |  |  | 	low  int
 | 
					
						
							|  |  |  | 	high int
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	// This is the uncommon case.
 | 
					
						
							|  |  |  | 	segments []lowHigh
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	// Used for validation.
 | 
					
						
							|  |  |  | 	firstByte byte
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-09-29 14:51:51 +02:00
										 |  |  | 	isString bool
 | 
					
						
							| 
									
										
										
										
											2018-10-17 13:16:45 +02:00
										 |  |  | }
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-10-18 10:21:23 +02:00
										 |  |  | type Items []Item
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-07-07 16:11:47 +02:00
										 |  |  | func (i Item) Pos() int {
 | 
					
						
							|  |  |  | 	if len(i.segments) > 0 {
 | 
					
						
							|  |  |  | 		return i.segments[0].Low
 | 
					
						
							|  |  |  | 	}
 | 
					
						
							|  |  |  | 	return i.low
 | 
					
						
							|  |  |  | }
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (i Item) Val(source []byte) []byte {
 | 
					
						
							|  |  |  | 	if len(i.segments) == 0 {
 | 
					
						
							|  |  |  | 		return source[i.low:i.high]
 | 
					
						
							|  |  |  | 	}
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if len(i.segments) == 1 {
 | 
					
						
							|  |  |  | 		return source[i.segments[0].Low:i.segments[0].High]
 | 
					
						
							|  |  |  | 	}
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	var b bytes.Buffer
 | 
					
						
							|  |  |  | 	for _, s := range i.segments {
 | 
					
						
							|  |  |  | 		b.Write(source[s.Low:s.High])
 | 
					
						
							|  |  |  | 	}
 | 
					
						
							|  |  |  | 	return b.Bytes()
 | 
					
						
							|  |  |  | }
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (i Item) ValStr(source []byte) string {
 | 
					
						
							|  |  |  | 	return string(i.Val(source))
 | 
					
						
							| 
									
										
										
										
											2018-10-18 09:47:39 +02:00
										 |  |  | }
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-07-07 16:11:47 +02:00
										 |  |  | func (i Item) ValTyped(source []byte) any {
 | 
					
						
							|  |  |  | 	str := i.ValStr(source)
 | 
					
						
							| 
									
										
										
										
											2019-09-29 14:51:51 +02:00
										 |  |  | 	if i.isString {
 | 
					
						
							|  |  |  | 		// A quoted value that is a string even if it looks like a number etc.
 | 
					
						
							|  |  |  | 		return str
 | 
					
						
							|  |  |  | 	}
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if boolRe.MatchString(str) {
 | 
					
						
							|  |  |  | 		return str == "true"
 | 
					
						
							|  |  |  | 	}
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if intRe.MatchString(str) {
 | 
					
						
							|  |  |  | 		num, err := strconv.Atoi(str)
 | 
					
						
							|  |  |  | 		if err != nil {
 | 
					
						
							|  |  |  | 			return str
 | 
					
						
							|  |  |  | 		}
 | 
					
						
							|  |  |  | 		return num
 | 
					
						
							|  |  |  | 	}
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if floatRe.MatchString(str) {
 | 
					
						
							|  |  |  | 		num, err := strconv.ParseFloat(str, 64)
 | 
					
						
							|  |  |  | 		if err != nil {
 | 
					
						
							|  |  |  | 			return str
 | 
					
						
							|  |  |  | 		}
 | 
					
						
							|  |  |  | 		return num
 | 
					
						
							|  |  |  | 	}
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	return str
 | 
					
						
							|  |  |  | }
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-10-17 13:16:45 +02:00
										 |  |  | func (i Item) IsText() bool {
 | 
					
						
							| 
									
										
										
										
											2022-05-28 13:18:50 +02:00
										 |  |  | 	return i.Type == tText || i.Type == tIndentation
 | 
					
						
							|  |  |  | }
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (i Item) IsIndentation() bool {
 | 
					
						
							|  |  |  | 	return i.Type == tIndentation
 | 
					
						
							| 
									
										
										
										
											2018-10-19 11:30:57 +02:00
										 |  |  | }
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-07-07 16:11:47 +02:00
										 |  |  | func (i Item) IsNonWhitespace(source []byte) bool {
 | 
					
						
							|  |  |  | 	return len(bytes.TrimSpace(i.Val(source))) > 0
 | 
					
						
							| 
									
										
										
										
											2018-10-17 13:16:45 +02:00
										 |  |  | }
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (i Item) IsShortcodeName() bool {
 | 
					
						
							| 
									
										
										
										
											2018-10-19 11:30:57 +02:00
										 |  |  | 	return i.Type == tScName
 | 
					
						
							| 
									
										
										
										
											2018-10-17 13:16:45 +02:00
										 |  |  | }
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-11-26 11:01:27 +01:00
										 |  |  | func (i Item) IsInlineShortcodeName() bool {
 | 
					
						
							|  |  |  | 	return i.Type == tScNameInline
 | 
					
						
							|  |  |  | }
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-10-17 13:16:45 +02:00
										 |  |  | func (i Item) IsLeftShortcodeDelim() bool {
 | 
					
						
							| 
									
										
										
										
											2018-10-19 11:30:57 +02:00
										 |  |  | 	return i.Type == tLeftDelimScWithMarkup || i.Type == tLeftDelimScNoMarkup
 | 
					
						
							| 
									
										
										
										
											2018-10-17 13:16:45 +02:00
										 |  |  | }
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (i Item) IsRightShortcodeDelim() bool {
 | 
					
						
							| 
									
										
										
										
											2018-10-19 11:30:57 +02:00
										 |  |  | 	return i.Type == tRightDelimScWithMarkup || i.Type == tRightDelimScNoMarkup
 | 
					
						
							| 
									
										
										
										
											2018-10-17 13:16:45 +02:00
										 |  |  | }
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (i Item) IsShortcodeClose() bool {
 | 
					
						
							| 
									
										
										
										
											2018-10-19 11:30:57 +02:00
										 |  |  | 	return i.Type == tScClose
 | 
					
						
							| 
									
										
										
										
											2018-10-17 13:16:45 +02:00
										 |  |  | }
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (i Item) IsShortcodeParam() bool {
 | 
					
						
							| 
									
										
										
										
											2018-10-19 11:30:57 +02:00
										 |  |  | 	return i.Type == tScParam
 | 
					
						
							| 
									
										
										
										
											2018-10-17 13:16:45 +02:00
										 |  |  | }
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (i Item) IsShortcodeParamVal() bool {
 | 
					
						
							| 
									
										
										
										
											2018-10-19 11:30:57 +02:00
										 |  |  | 	return i.Type == tScParamVal
 | 
					
						
							| 
									
										
										
										
											2018-10-17 13:16:45 +02:00
										 |  |  | }
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (i Item) IsShortcodeMarkupDelimiter() bool {
 | 
					
						
							| 
									
										
										
										
											2018-10-19 11:30:57 +02:00
										 |  |  | 	return i.Type == tLeftDelimScWithMarkup || i.Type == tRightDelimScWithMarkup
 | 
					
						
							| 
									
										
										
										
											2018-10-18 10:21:23 +02:00
										 |  |  | }
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (i Item) IsFrontMatter() bool {
 | 
					
						
							| 
									
										
										
										
											2018-10-19 11:30:57 +02:00
										 |  |  | 	return i.Type >= TypeFrontMatterYAML && i.Type <= TypeFrontMatterORG
 | 
					
						
							| 
									
										
										
										
											2018-10-17 13:16:45 +02:00
										 |  |  | }
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (i Item) IsDone() bool {
 | 
					
						
							| 
									
										
										
										
											2018-10-19 11:30:57 +02:00
										 |  |  | 	return i.Type == tError || i.Type == tEOF
 | 
					
						
							| 
									
										
										
										
											2018-10-17 13:16:45 +02:00
										 |  |  | }
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (i Item) IsEOF() bool {
 | 
					
						
							| 
									
										
										
										
											2018-10-19 11:30:57 +02:00
										 |  |  | 	return i.Type == tEOF
 | 
					
						
							| 
									
										
										
										
											2018-10-17 13:16:45 +02:00
										 |  |  | }
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (i Item) IsError() bool {
 | 
					
						
							| 
									
										
										
										
											2018-10-19 11:30:57 +02:00
										 |  |  | 	return i.Type == tError
 | 
					
						
							| 
									
										
										
										
											2018-10-17 13:16:45 +02:00
										 |  |  | }
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-07-07 16:11:47 +02:00
										 |  |  | func (i Item) ToString(source []byte) string {
 | 
					
						
							|  |  |  | 	val := i.Val(source)
 | 
					
						
							| 
									
										
										
										
											2018-10-17 13:16:45 +02:00
										 |  |  | 	switch {
 | 
					
						
							| 
									
										
										
										
											2018-10-19 11:30:57 +02:00
										 |  |  | 	case i.Type == tEOF:
 | 
					
						
							| 
									
										
										
										
											2018-10-17 13:16:45 +02:00
										 |  |  | 		return "EOF"
 | 
					
						
							| 
									
										
										
										
											2018-10-19 11:30:57 +02:00
										 |  |  | 	case i.Type == tError:
 | 
					
						
							| 
									
										
										
										
											2022-07-07 16:11:47 +02:00
										 |  |  | 		return string(val)
 | 
					
						
							| 
									
										
										
										
											2022-05-28 13:18:50 +02:00
										 |  |  | 	case i.Type == tIndentation:
 | 
					
						
							| 
									
										
										
										
											2022-07-07 16:11:47 +02:00
										 |  |  | 		return fmt.Sprintf("%s:[%s]", i.Type, util.VisualizeSpaces(val))
 | 
					
						
							| 
									
										
										
										
											2018-10-19 11:30:57 +02:00
										 |  |  | 	case i.Type > tKeywordMarker:
 | 
					
						
							| 
									
										
										
										
											2022-07-07 16:11:47 +02:00
										 |  |  | 		return fmt.Sprintf("<%s>", val)
 | 
					
						
							|  |  |  | 	case len(val) > 50:
 | 
					
						
							|  |  |  | 		return fmt.Sprintf("%v:%.20q...", i.Type, val)
 | 
					
						
							| 
									
										
										
										
											2018-10-17 13:16:45 +02:00
										 |  |  | 	}
 | 
					
						
							| 
									
										
										
										
											2022-07-07 16:11:47 +02:00
										 |  |  | 	return fmt.Sprintf("%v:[%s]", i.Type, val)
 | 
					
						
							| 
									
										
										
										
											2018-10-17 13:16:45 +02:00
										 |  |  | }
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-10-18 10:21:23 +02:00
										 |  |  | type ItemType int
 | 
					
						
							| 
									
										
										
										
											2018-10-17 13:16:45 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | const (
 | 
					
						
							| 
									
										
										
										
											2018-10-18 10:21:23 +02:00
										 |  |  | 	tError ItemType = iota
 | 
					
						
							| 
									
										
										
										
											2018-10-17 13:16:45 +02:00
										 |  |  | 	tEOF
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-10-17 13:48:55 +02:00
										 |  |  | 	// page items
 | 
					
						
							| 
									
										
										
										
											2018-10-23 14:37:09 +02:00
										 |  |  | 	TypeLeadSummaryDivider // <!--more-->,  # more
 | 
					
						
							| 
									
										
										
										
											2018-10-18 10:21:23 +02:00
										 |  |  | 	TypeFrontMatterYAML
 | 
					
						
							|  |  |  | 	TypeFrontMatterTOML
 | 
					
						
							|  |  |  | 	TypeFrontMatterJSON
 | 
					
						
							|  |  |  | 	TypeFrontMatterORG
 | 
					
						
							| 
									
										
											  
											
												Move the emoji parsing to pageparser
This avoids double parsing the page content when `enableEmoji=true`.
This commit also adds some general improvements to the parser, making it in general much faster:
```bash
benchmark                     old ns/op     new ns/op     delta
BenchmarkShortcodeLexer-4     90258         101730        +12.71%
BenchmarkParse-4              148940        15037         -89.90%
benchmark                     old allocs     new allocs     delta
BenchmarkShortcodeLexer-4     456            700            +53.51%
BenchmarkParse-4              28             33             +17.86%
benchmark                     old bytes     new bytes     delta
BenchmarkShortcodeLexer-4     69875         81014         +15.94%
BenchmarkParse-4              8128          8304          +2.17%
```
Running some site benchmarks with Emoji support turned on:
```bash
benchmark                                                                                     old ns/op     new ns/op     delta
BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4     924556797     818115620     -11.51%
benchmark                                                                                     old allocs     new allocs     delta
BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4     4112613        4133787        +0.51%
benchmark                                                                                     old bytes     new bytes     delta
BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4     426982864     424363832     -0.61%
```
Fixes #5534
											
										 
											2018-12-17 21:03:23 +01:00
										 |  |  | 	TypeEmoji
 | 
					
						
							| 
									
										
										
										
											2018-10-18 10:21:23 +02:00
										 |  |  | 	TypeIgnore // // The BOM Unicode byte order marker and possibly others
 | 
					
						
							| 
									
										
										
										
											2018-10-17 13:48:55 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-10-17 13:16:45 +02:00
										 |  |  | 	// shortcode items
 | 
					
						
							|  |  |  | 	tLeftDelimScNoMarkup
 | 
					
						
							|  |  |  | 	tRightDelimScNoMarkup
 | 
					
						
							|  |  |  | 	tLeftDelimScWithMarkup
 | 
					
						
							|  |  |  | 	tRightDelimScWithMarkup
 | 
					
						
							|  |  |  | 	tScClose
 | 
					
						
							|  |  |  | 	tScName
 | 
					
						
							| 
									
										
										
										
											2018-11-26 11:01:27 +01:00
										 |  |  | 	tScNameInline
 | 
					
						
							| 
									
										
										
										
											2018-10-17 13:16:45 +02:00
										 |  |  | 	tScParam
 | 
					
						
							|  |  |  | 	tScParamVal
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-28 13:18:50 +02:00
										 |  |  | 	tIndentation
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-10-17 13:48:55 +02:00
										 |  |  | 	tText // plain text
 | 
					
						
							| 
									
										
										
										
											2018-10-17 13:16:45 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	// preserved for later - keywords come after this
 | 
					
						
							|  |  |  | 	tKeywordMarker
 | 
					
						
							|  |  |  | )
 | 
					
						
							| 
									
										
										
										
											2019-09-29 14:51:51 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | var (
 | 
					
						
							| 
									
										
										
										
											2022-11-17 20:00:49 -08:00
										 |  |  | 	boolRe  = regexp.MustCompile(`^(true|false)$`)
 | 
					
						
							| 
									
										
										
										
											2019-09-29 14:51:51 +02:00
										 |  |  | 	intRe   = regexp.MustCompile(`^[-+]?\d+$`)
 | 
					
						
							|  |  |  | 	floatRe = regexp.MustCompile(`^[-+]?\d*\.\d+$`)
 | 
					
						
							|  |  |  | )
 |