aboutsummaryrefslogblamecommitdiff
path: root/telegram/formatter/formatter.go
blob: a8c94a0c92ba9fd4da4d755bf2fa247c846b7ef8 (plain) (tree)
1
2
3
4
5
6
7
8
9
10



                 
                 

                                        
                                            

 
                      
 






                                             
 






                                                 

                     
                            

 
                                                     
                                 
                                
 

                                    
                             

                                      
                           

                                   




                                       
 
                                                                                  

                                                                    
                                                                                                     









                                                                             
                                                         

               
                                  













                                                                       

                                            









                                                              





                                                                                
                                                             









                                                        
                                                                                                                     





























                                                                                                              
                                                                
                                                                                               


                                                                    




                                                                 
                                                                                     

                                                        

                                                                     




                                                               
                                                                         

                                                                      
                                                                             












                                                   


                                                                                  

                                              


                                                 

                                                              






























                                                                                                                                    
                                              








                                                          
                 






















                                                                                                                                                  

 





                                                                                                               

                                                 
                                                                                 

                                                                     

                                                                                     
                                           
                                                                 
                                          
                                                                       

                                                                         
                                                                                               

                                                                                     

                                                                         
                                                                                      

         
                             

 

                                                                                                              
                                                
                                     

         




                                                                               

                                                                                   
                                           
                                                                 
                                          
                                                                       

                                                                         
                                                                                               

                                                                                    





                                                                                   















                                                                            


                                                                                     


                                      
                                  




                                 





                                                                                        
 





                                                                                                                    


                                                  
                                               











                                                                                    


                                                                                           
                 

                                                                                  



                                                                           










                                                                                     
                                                                                









                                                                          
                                                                                







                                                                





                                                       
                         
 






                                                                    





                                                                              

                                       








                                                                     
package formatter

import (
	"sort"
	"unicode"

	log "github.com/sirupsen/logrus"
	"github.com/zelenin/go-tdlib/client"
)

type insertionType int

const (
	insertionOpening insertionType = iota
	insertionClosing
	insertionUnpaired
)

type MarkupModeType int

const (
	MarkupModeXEP0393 MarkupModeType = iota
	MarkupModeMarkdown
)

// insertion is a piece of text in given position
type insertion struct {
	Offset int32
	Runes  []rune
	Type   insertionType
}

// insertionStack contains the sequence of insertions
// from the start or from the end
type insertionStack []*insertion

var boldRunesMarkdown = []rune("**")
var boldRunesXEP0393 = []rune("*")
var italicRunes = []rune("_")
var strikeRunesMarkdown = []rune("~~")
var strikeRunesXEP0393 = []rune("~")
var codeRunes = []rune("`")
var preRunesStart = []rune("```\n")
var preRunesEnd = []rune("\n```")
var quoteRunes = []rune("> ")
var newlineRunes = []rune("\n")
var doubleNewlineRunes = []rune("\n\n")
var newlineCode = rune(0x0000000a)
var bmpCeil = rune(0x0000ffff)

// rebalance pumps all the values until the given offset to current stack (growing
// from start) from given stack (growing from end); should be called
// before any insertions to the current stack at the given offset
func (s insertionStack) rebalance(s2 insertionStack, offset int32) (insertionStack, insertionStack) {
	for len(s2) > 0 && s2[len(s2)-1].Offset <= offset {
		s = append(s, s2[len(s2)-1])
		s2 = s2[:len(s2)-1]
	}

	return s, s2
}

// NewIterator is a second order function that sequentially scans and returns
// stack elements; starts returning nil when elements are ended
func (s insertionStack) NewIterator() func() *insertion {
	i := -1

	return func() *insertion {
		i++
		if i < len(s) {
			return s[i]
		}
		return nil
	}
}

// SortEntities arranges the entities in traversal-ready order
func SortEntities(entities []*client.TextEntity) []*client.TextEntity {
	sortedEntities := make([]*client.TextEntity, len(entities))
	copy(sortedEntities, entities)

	sort.Slice(sortedEntities, func(i int, j int) bool {
		entity1 := sortedEntities[i]
		entity2 := sortedEntities[j]
		if entity1.Offset < entity2.Offset {
			return true
		} else if entity1.Offset == entity2.Offset {
			return entity1.Length > entity2.Length
		}
		return false
	})
	return sortedEntities
}

// MergeAdjacentEntities merges entities of a same kind
func MergeAdjacentEntities(entities []*client.TextEntity) []*client.TextEntity {
	mergedEntities := make([]*client.TextEntity, 0, len(entities))
	excludedIndices := make(map[int]bool)

	for i, entity := range entities {
		if excludedIndices[i] || entity.Type == nil {
			continue
		}

		typ := entity.Type.TextEntityTypeType()
		start := entity.Offset
		end := start + entity.Length
		ei := make(map[int]bool)

		// collect continuations
		for j, entity2 := range entities[i+1:] {
			if entity2.Type != nil && entity2.Type.TextEntityTypeType() == typ && entity2.Offset == end {
				end += entity2.Length
				ei[j+i+1] = true
			}
		}

		// check for intersections with other entities
		var isIntersecting bool
		if len(ei) > 0 {
			for _, entity2 := range entities {
				entity2End := entity2.Offset + entity2.Length
				if (entity2.Offset < start && entity2End > start && entity2End < end) ||
					(entity2.Offset > start && entity2.Offset < end && entity2End > end) {
					isIntersecting = true
					break
				}
			}
		}

		if !isIntersecting {
			entity.Length = end - start
			for j := range ei {
				excludedIndices[j] = true
			}
		}
		mergedEntities = append(mergedEntities, entity)
	}

	return mergedEntities
}

// ClaspDirectives to the following span as required by XEP-0393
func ClaspDirectives(doubledRunes []rune, entities []*client.TextEntity) []*client.TextEntity {
	alignedEntities := make([]*client.TextEntity, len(entities))
	copy(alignedEntities, entities)

	for i, entity := range alignedEntities {
		var dirty bool
		endOffset := entity.Offset + entity.Length

		if unicode.IsSpace(doubledRunes[entity.Offset]) {
			for j, r := range doubledRunes[entity.Offset+1 : endOffset] {
				if !unicode.IsSpace(r) {
					dirty = true
					entity.Offset += int32(j + 1)
					entity.Length -= int32(j + 1)
					break
				}
			}
		}
		if unicode.IsSpace(doubledRunes[endOffset-1]) {
			for j := endOffset - 2; j >= entity.Offset; j-- {
				if !unicode.IsSpace(doubledRunes[j]) {
					dirty = true
					entity.Length = j + 1 - entity.Offset
					break
				}
			}
		}

		if dirty {
			alignedEntities[i] = entity
		}
	}

	return alignedEntities
}

func markupBraces(entity *client.TextEntity, lbrace, rbrace []rune) []*insertion {
	return []*insertion{
		&insertion{
			Offset: entity.Offset,
			Runes:  lbrace,
			Type:   insertionOpening,
		},
		&insertion{
			Offset: entity.Offset + entity.Length,
			Runes:  rbrace,
			Type:   insertionClosing,
		},
	}
}

func quotePrependNewlines(entity *client.TextEntity, doubledRunes []rune, markupMode MarkupModeType) []*insertion {
	if len(doubledRunes) == 0 {
		return []*insertion{}
	}

	startRunes := []rune("\n> ")
	if entity.Offset == 0 || doubledRunes[entity.Offset-1] == newlineCode {
		startRunes = quoteRunes
	}
	insertions := []*insertion{
		&insertion{
			Offset: entity.Offset,
			Runes:  startRunes,
			Type:   insertionUnpaired,
		},
	}

	entityEnd := entity.Offset + entity.Length
	entityEndInt := int(entityEnd)

	var wasNewline bool
	// last newline is omitted, there's no need to put quote mark after the quote
	for i := entity.Offset; i < entityEnd-1; i++ {
		isNewline := doubledRunes[i] == newlineCode
		if (isNewline && markupMode == MarkupModeXEP0393) || (wasNewline && isNewline && markupMode == MarkupModeMarkdown) {
			insertions = append(insertions, &insertion{
				Offset: i + 1,
				Runes:  quoteRunes,
				Type:   insertionUnpaired,
			})
		}

		if isNewline {
			wasNewline = true
		} else {
			wasNewline = false
		}
	}

	var rbrace []rune
	if len(doubledRunes) > entityEndInt {
		if doubledRunes[entityEnd] == newlineCode {
			if markupMode == MarkupModeMarkdown && len(doubledRunes) > entityEndInt+1 && doubledRunes[entityEndInt+1] != newlineCode {
				rbrace = newlineRunes
			}
		} else {
			if markupMode == MarkupModeMarkdown {
				rbrace = doubleNewlineRunes
			} else {
				rbrace = newlineRunes
			}
		}
	}
	insertions = append(insertions, &insertion{
		Offset: entityEnd,
		Runes:  rbrace,
		Type:   insertionClosing,
	})

	return insertions
}

// entityToMarkdown generates the wrapping Markdown tags
func entityToMarkdown(entity *client.TextEntity, doubledRunes []rune, markupMode MarkupModeType) []*insertion {
	if entity == nil || entity.Type == nil {
		return []*insertion{}
	}

	switch entity.Type.TextEntityTypeType() {
	case client.TypeTextEntityTypeBold:
		return markupBraces(entity, boldRunesMarkdown, boldRunesMarkdown)
	case client.TypeTextEntityTypeItalic:
		return markupBraces(entity, italicRunes, italicRunes)
	case client.TypeTextEntityTypeStrikethrough:
		return markupBraces(entity, strikeRunesMarkdown, strikeRunesMarkdown)
	case client.TypeTextEntityTypeCode:
		return markupBraces(entity, codeRunes, codeRunes)
	case client.TypeTextEntityTypePre:
		return markupBraces(entity, preRunesStart, preRunesEnd)
	case client.TypeTextEntityTypePreCode:
		preCode, _ := entity.Type.(*client.TextEntityTypePreCode)
		return markupBraces(entity, []rune("\n```"+preCode.Language+"\n"), preRunesEnd)
	case client.TypeTextEntityTypeBlockQuote:
		return quotePrependNewlines(entity, doubledRunes, MarkupModeMarkdown)
	case client.TypeTextEntityTypeTextUrl:
		textURL, _ := entity.Type.(*client.TextEntityTypeTextUrl)
		return markupBraces(entity, []rune("["), []rune("]("+textURL.Url+")"))
	}

	return []*insertion{}
}

// entityToXEP0393 generates the wrapping XEP-0393 tags
func entityToXEP0393(entity *client.TextEntity, doubledRunes []rune, markupMode MarkupModeType) []*insertion {
	if entity == nil || entity.Type == nil {
		return []*insertion{}
	}

	switch entity.Type.TextEntityTypeType() {
	case client.TypeTextEntityTypeBold:
		return markupBraces(entity, boldRunesXEP0393, boldRunesXEP0393)
	case client.TypeTextEntityTypeItalic:
		return markupBraces(entity, italicRunes, italicRunes)
	case client.TypeTextEntityTypeStrikethrough:
		return markupBraces(entity, strikeRunesXEP0393, strikeRunesXEP0393)
	case client.TypeTextEntityTypeCode:
		return markupBraces(entity, codeRunes, codeRunes)
	case client.TypeTextEntityTypePre:
		return markupBraces(entity, preRunesStart, preRunesEnd)
	case client.TypeTextEntityTypePreCode:
		preCode, _ := entity.Type.(*client.TextEntityTypePreCode)
		return markupBraces(entity, []rune("\n```"+preCode.Language+"\n"), preRunesEnd)
	case client.TypeTextEntityTypeBlockQuote:
		return quotePrependNewlines(entity, doubledRunes, MarkupModeXEP0393)
	case client.TypeTextEntityTypeTextUrl:
		textURL, _ := entity.Type.(*client.TextEntityTypeTextUrl)
		// non-standard, Pidgin-specific
		return markupBraces(entity, []rune{}, []rune(" <"+textURL.Url+">"))
	}

	return []*insertion{}
}

// transform the source text into a form with uniform runes and code points,
// by duplicating anything beyond the Basic Multilingual Plane
func textToDoubledRunes(text string) []rune {
	doubledRunes := make([]rune, 0, len(text)*2)
	for _, cp := range text {
		if cp > bmpCeil {
			doubledRunes = append(doubledRunes, cp, cp)
		} else {
			doubledRunes = append(doubledRunes, cp)
		}
	}

	return doubledRunes
}

// Format traverses an already sorted list of entities and wraps the text in a markup
func Format(
	sourceText string,
	entities []*client.TextEntity,
	markupMode MarkupModeType,
) string {
	if len(entities) == 0 {
		return sourceText
	}

	var entityToMarkup func(*client.TextEntity, []rune, MarkupModeType) []*insertion
	if markupMode == MarkupModeXEP0393 {
		entityToMarkup = entityToXEP0393
	} else {
		entityToMarkup = entityToMarkdown
	}

	doubledRunes := textToDoubledRunes(sourceText)

	mergedEntities := SortEntities(ClaspDirectives(doubledRunes, MergeAdjacentEntities(SortEntities(entities))))

	startStack := make(insertionStack, 0, len(sourceText))
	endStack := make(insertionStack, 0, len(sourceText))

	// convert entities to a stack of brackets
	var maxEndOffset int32
	for _, entity := range mergedEntities {
		log.Debugf("%#v", entity)
		if entity.Length <= 0 {
			continue
		}

		endOffset := entity.Offset + entity.Length
		if endOffset > maxEndOffset {
			maxEndOffset = endOffset
		}

		startStack, endStack = startStack.rebalance(endStack, entity.Offset)

		insertions := entityToMarkup(entity, doubledRunes, markupMode)
		if len(insertions) > 1 {
			startStack = append(startStack, insertions[0:len(insertions)-1]...)
		}
		if len(insertions) > 0 {
			endStack = append(endStack, insertions[len(insertions)-1])
		}
	}
	// flush the closing brackets that still remain in endStack
	startStack, endStack = startStack.rebalance(endStack, maxEndOffset)
	// sort unpaired insertions
	sort.SliceStable(startStack, func(i int, j int) bool {
		ins1 := startStack[i]
		ins2 := startStack[j]
		if ins1.Type == insertionUnpaired && ins2.Type == insertionUnpaired {
			return ins1.Offset < ins2.Offset
		}
		if ins1.Type == insertionUnpaired {
			if ins1.Offset == ins2.Offset {
				if ins2.Type == insertionOpening { // > **
					return true
				} else if ins2.Type == insertionClosing { // **>
					return false
				}
			} else {
				return ins1.Offset < ins2.Offset
			}
		}
		if ins2.Type == insertionUnpaired {
			if ins1.Offset == ins2.Offset {
				if ins1.Type == insertionOpening { // > **
					return false
				} else if ins1.Type == insertionClosing { // **>
					return true
				}
			} else {
				return ins1.Offset < ins2.Offset
			}
		}
		return false
	})

	// merge brackets into text
	markupRunes := make([]rune, 0, len(sourceText))

	nextInsertion := startStack.NewIterator()
	insertion := nextInsertion()
	var skipNext bool

	for i, cp := range doubledRunes {
		if skipNext {
			skipNext = false
			continue
		}

		for insertion != nil && int(insertion.Offset) <= i {
			markupRunes = append(markupRunes, insertion.Runes...)
			insertion = nextInsertion()
		}

		markupRunes = append(markupRunes, cp)
		// skip two UTF-16 code units (not points actually!) if needed
		if cp > bmpCeil {
			skipNext = true
		}
	}
	for insertion != nil {
		markupRunes = append(markupRunes, insertion.Runes...)
		insertion = nextInsertion()
	}

	return string(markupRunes)
}