From 70383bee128b11451211e514200b23f47bf272db Mon Sep 17 00:00:00 2001
From: bodqhrohro <bodqhrohro@gmail.com>
Date: Thu, 9 Jan 2020 23:16:40 +0200
Subject: Convert formatting entities to Markdown

---
 telegram/formatter/formatter.go      | 165 +++++++++++++++++++++++++++
 telegram/formatter/formatter_test.go | 208 +++++++++++++++++++++++++++++++++++
 telegram/utils.go                    |  44 ++++++--
 3 files changed, 410 insertions(+), 7 deletions(-)
 create mode 100644 telegram/formatter/formatter.go
 create mode 100644 telegram/formatter/formatter_test.go

(limited to 'telegram')

diff --git a/telegram/formatter/formatter.go b/telegram/formatter/formatter.go
new file mode 100644
index 0000000..4b26f83
--- /dev/null
+++ b/telegram/formatter/formatter.go
@@ -0,0 +1,165 @@
+package formatter
+
+import (
+	"sort"
+
+	log "github.com/sirupsen/logrus"
+	"github.com/zelenin/go-tdlib/client"
+)
+
+// Insertion is a piece of text to be spliced in at the given position
+type Insertion struct {
+	Offset int32  // position in the source text; Format counts it in UTF-16 code units
+	Runes  []rune // text to insert at Offset
+}
+
+// InsertionStack is a sequence of insertions used as a stack: rebalance
+// pops from the tail of one stack and pushes onto the tail of another
+type InsertionStack []*Insertion
+
+var boldRunes = []rune("**")      // opens and closes bold text
+var italicRunes = []rune("_")     // opens and closes italic text
+var codeRunes = []rune("\n```\n") // fences code/pre; also closes pre with a language
+var urlRuneL = []rune("[")        // opens a text URL; the closer carries the URL itself
+
+// rebalance pops insertions off the top of s2 (a stack growing from the end)
+// while their offset does not exceed the given one and pushes them onto s
+// (a stack growing from the start); call it before pushing to s at that offset
+func (s InsertionStack) rebalance(s2 InsertionStack, offset int32) (InsertionStack, InsertionStack) {
+	for top := len(s2) - 1; top >= 0 && s2[top].Offset <= offset; top-- {
+		s = append(s, s2[top])
+		s2 = s2[:top]
+	}
+
+	return s, s2
+}
+
+// NewIterator returns a closure that yields the stack elements one at a time
+// starting from the bottom; once they run out it keeps returning nil
+func (s InsertionStack) NewIterator() func() *Insertion {
+	next := 0
+	return func() *Insertion {
+		if next >= len(s) {
+			return nil
+		}
+		current := s[next]
+		next++
+		return current
+	}
+}
+
+// SortEntities returns a copy of entities in traversal-ready order: ascending
+// Offset, with the longer (enclosing) entity first when offsets are equal
+func SortEntities(entities []*client.TextEntity) []*client.TextEntity {
+	sortedEntities := make([]*client.TextEntity, len(entities))
+	copy(sortedEntities, entities)
+
+	sort.Slice(sortedEntities, func(i, j int) bool {
+		// sort.Slice passes indices into the slice being sorted, so the
+		// comparison must read sortedEntities, not the unsorted input
+		a, b := sortedEntities[i], sortedEntities[j]
+		if a.Offset != b.Offset {
+			return a.Offset < b.Offset
+		}
+		return a.Length > b.Length
+	})
+	return sortedEntities
+}
+
+// markupBraces pairs an opening insertion at the entity's first code unit
+// with a closing insertion right past its last one; lbrace and rbrace are
+// stored as given, not copied
+func markupBraces(entity *client.TextEntity, lbrace, rbrace []rune) (*Insertion, *Insertion) {
+	opening := &Insertion{Offset: entity.Offset, Runes: lbrace}
+	closing := &Insertion{Offset: entity.Offset + entity.Length, Runes: rbrace}
+
+	return opening, closing
+}
+
+// EntityToMarkdown returns the opening and closing Markdown insertions for
+// the entity, or (nil, nil) when its type has no Markdown representation
+// (including a nil Type)
+func EntityToMarkdown(entity *client.TextEntity) (*Insertion, *Insertion) {
+	switch entityType := entity.Type.(type) {
+	case *client.TextEntityTypeBold:
+		return markupBraces(entity, boldRunes, boldRunes)
+	case *client.TextEntityTypeItalic:
+		return markupBraces(entity, italicRunes, italicRunes)
+	case *client.TextEntityTypeCode, *client.TextEntityTypePre:
+		return markupBraces(entity, codeRunes, codeRunes)
+	case *client.TextEntityTypePreCode:
+		return markupBraces(entity, []rune("\n```"+entityType.Language+"\n"), codeRunes)
+	case *client.TextEntityTypeTextUrl:
+		return markupBraces(entity, urlRuneL, []rune("]("+entityType.Url+")"))
+	}
+
+	return nil, nil
+}
+
+// Format wraps sourceText in the markup that entityToMarkup yields for each
+// entity; entities must be pre-sorted (see SortEntities), offsets are UTF-16
+func Format(
+	sourceText string,
+	entities []*client.TextEntity,
+	entityToMarkup func(*client.TextEntity) (*Insertion, *Insertion),
+) string {
+	if len(entities) == 0 {
+		return sourceText
+	}
+
+	// startStack may receive both openers and rebalanced closers, endStack closers only
+	startStack := make(InsertionStack, 0, 2*len(entities))
+	endStack := make(InsertionStack, 0, len(entities))
+
+	// convert entities to a stack of brackets
+	var maxEndOffset int32
+	for _, entity := range entities {
+		log.Debugf("%#v", entity)
+		if entity.Length <= 0 {
+			continue
+		}
+
+		endOffset := entity.Offset + entity.Length
+		if endOffset > maxEndOffset {
+			maxEndOffset = endOffset
+		}
+
+		startStack, endStack = startStack.rebalance(endStack, entity.Offset)
+		startInsertion, endInsertion := entityToMarkup(entity)
+		if startInsertion != nil {
+			startStack = append(startStack, startInsertion)
+		}
+		if endInsertion != nil {
+			endStack = append(endStack, endInsertion)
+		}
+	}
+	// flush the closing brackets that still remain in endStack
+	startStack, _ = startStack.rebalance(endStack, maxEndOffset)
+
+	// merge brackets into text
+	markupRunes := make([]rune, 0, len(sourceText))
+
+	nextInsertion := startStack.NewIterator()
+	insertion := nextInsertion()
+	var runeI int32
+
+	for _, cp := range sourceText {
+		for insertion != nil && insertion.Offset <= runeI {
+			markupRunes = append(markupRunes, insertion.Runes...)
+			insertion = nextInsertion()
+		}
+
+		markupRunes = append(markupRunes, cp)
+		// runes outside the BMP take two UTF-16 code units (not points actually!)
+		runeI++
+		if cp > 0x0000ffff {
+			runeI++
+		}
+	}
+	for insertion != nil {
+		markupRunes = append(markupRunes, insertion.Runes...)
+		insertion = nextInsertion()
+	}
+
+	return string(markupRunes)
+}
diff --git a/telegram/formatter/formatter_test.go b/telegram/formatter/formatter_test.go
new file mode 100644
index 0000000..63337d6
--- /dev/null
+++ b/telegram/formatter/formatter_test.go
@@ -0,0 +1,208 @@
+package formatter
+
+import (
+	"testing"
+
+	"github.com/zelenin/go-tdlib/client"
+)
+
+func TestNoFormatting(t *testing.T) {
+	const source = "abc\ndef"
+	if got := Format(source, []*client.TextEntity{}, EntityToMarkdown); got != source {
+		t.Errorf("No formatting expected, but: %v", got)
+	}
+}
+
+// TestFormattingSimple covers one bold entity over runes that take two UTF-16 units each
+func TestFormattingSimple(t *testing.T) {
+	bold := []*client.TextEntity{
+		{Offset: 2, Length: 4, Type: &client.TextEntityTypeBold{}},
+	}
+
+	const want = "👙**🐧🐖**"
+	got := Format("👙🐧🐖", bold, EntityToMarkdown)
+	if got != want {
+		t.Errorf("Wrong simple formatting: %v", got)
+	}
+}
+
+// TestFormattingAdjacent covers two entities that touch at offset 5 without
+// overlapping: the closer of the italic one has to be emitted before the
+// opener of the link
+func TestFormattingAdjacent(t *testing.T) {
+	entities := []*client.TextEntity{
+		{Offset: 3, Length: 2, Type: &client.TextEntityTypeItalic{}},
+		{
+			Offset: 5,
+			Length: 2,
+			Type:   &client.TextEntityTypeTextUrl{Url: "https://narayana.im/"},
+		},
+	}
+
+	const want = "a👙_🐧_[🐖](https://narayana.im/)"
+	got := Format("a👙🐧🐖", entities, EntityToMarkdown)
+	if got != want {
+		t.Errorf("Wrong adjacent formatting: %v", got)
+	}
+}
+
+func TestFormattingAdjacentAndNested(t *testing.T) {
+	entities := []*client.TextEntity{
+		{
+			Offset: 0,
+			Length: 4,
+			Type:   &client.TextEntityTypePre{},
+		},
+		{
+			Offset: 0,
+			Length: 2,
+			Type:   &client.TextEntityTypeBold{},
+		},
+		{
+			Offset: 4,
+			Length: 2,
+			Type:   &client.TextEntityTypeItalic{},
+		},
+	}
+	if got := Format("👙🐧🐖", entities, EntityToMarkdown); got != "\n```\n**👙**🐧\n```\n_🐖_" {
+		t.Errorf("Wrong adjacent&nested formatting: %v", got)
+	}
+}
+
+// TestRebalanceTwoZero checks that rebalancing from an empty stack leaves
+// both stacks untouched
+func TestRebalanceTwoZero(t *testing.T) {
+	start := InsertionStack{{Offset: 7}, {Offset: 8}}
+	end := InsertionStack{}
+
+	start, end = start.rebalance(end, 7)
+	if len(start) != 2 || len(end) != 0 || start[0].Offset != 7 || start[1].Offset != 8 {
+		t.Errorf("Wrong rebalance 2–0: %#v %#v", start, end)
+	}
+}
+
+// TestRebalanceNeeded checks that only the top of the second stack, whose
+// offset does not exceed the requested one, is moved over
+func TestRebalanceNeeded(t *testing.T) {
+	start := InsertionStack{{Offset: 7}, {Offset: 8}}
+	end := InsertionStack{{Offset: 10}, {Offset: 9}}
+
+	start, end = start.rebalance(end, 9)
+
+	moved := len(start) == 3 && len(end) == 1
+	if !moved ||
+		start[0].Offset != 7 || start[1].Offset != 8 || start[2].Offset != 9 ||
+		end[0].Offset != 10 {
+		t.Errorf("Wrong rebalance when needed: %#v %#v", start, end)
+	}
+}
+
+// TestRebalanceNotNeeded checks that nothing moves when even the top of the
+// second stack lies past the requested offset
+func TestRebalanceNotNeeded(t *testing.T) {
+	start := InsertionStack{{Offset: 7}, {Offset: 8}}
+	end := InsertionStack{{Offset: 10}, {Offset: 9}}
+
+	start, end = start.rebalance(end, 8)
+
+	untouched := len(start) == 2 && len(end) == 2
+	if !untouched ||
+		start[0].Offset != 7 || start[1].Offset != 8 ||
+		end[0].Offset != 10 || end[1].Offset != 9 {
+		t.Errorf("Wrong rebalance when not needed: %#v %#v", start, end)
+	}
+}
+
+// TestRebalanceLate checks that the whole second stack is drained, top first,
+// when the requested offset reaches its largest element
+func TestRebalanceLate(t *testing.T) {
+	start := InsertionStack{{Offset: 7}, {Offset: 8}}
+	end := InsertionStack{{Offset: 10}, {Offset: 9}}
+
+	start, end = start.rebalance(end, 10)
+
+	drained := len(start) == 4 && len(end) == 0
+	if !drained ||
+		start[0].Offset != 7 || start[1].Offset != 8 ||
+		start[2].Offset != 9 || start[3].Offset != 10 {
+		t.Errorf("Wrong rebalance when late: %#v %#v", start, end)
+	}
+}
+
+// TestIteratorEmpty checks that iterating an empty stack yields nil right away
+func TestIteratorEmpty(t *testing.T) {
+	next := InsertionStack{}.NewIterator()
+
+	if item := next(); item != nil {
+		t.Errorf("Empty iterator should return nil but returned %#v", item)
+	}
+}
+
+// TestIterator checks that elements come out in stack order and that the
+// iterator keeps returning nil once they are exhausted
+func TestIterator(t *testing.T) {
+	next := InsertionStack{{Offset: 7}, {Offset: 8}}.NewIterator()
+
+	item := next()
+	if item == nil || item.Offset != 7 {
+		t.Errorf("Wrong insertion instead of 7: %#v", item)
+	}
+
+	item = next()
+	if item == nil || item.Offset != 8 {
+		t.Errorf("Wrong insertion instead of 8: %#v", item)
+	}
+
+	if afterEnd := next(); afterEnd != nil {
+		t.Errorf("nil should be returned after end, %#v instead", afterEnd)
+	}
+
+	if further := next(); further != nil {
+		t.Errorf("Further attempts should return nil too, %#v instead", further)
+	}
+}
+
+func TestSortEntities(t *testing.T) {
+	sorted := SortEntities([]*client.TextEntity{
+		{
+			Offset: 3,
+			Length: 2,
+		},
+		{
+			Offset: 5,
+			Length: 2,
+		},
+		{
+			Offset: 7,
+			Length: 2,
+		},
+		{
+			Offset: 6,
+			Length: 1,
+		},
+		{
+			Offset: 5,
+			Length: 1,
+		},
+	})
+
+	want := [][2]int32{{3, 2}, {5, 2}, {5, 1}, {6, 1}, {7, 2}}
+	ok := len(sorted) == len(want)
+	for i := 0; ok && i < len(want); i++ {
+		ok = sorted[i].Offset == want[i][0] && sorted[i].Length == want[i][1]
+	}
+	if !ok {
+		t.Errorf("Wrong sorting order: %#v", sorted)
+	}
+}
+
+// TestSortEmpty checks that sorting an empty set of entities yields an empty one
+func TestSortEmpty(t *testing.T) {
+	sorted := SortEntities([]*client.TextEntity{})
+	if len(sorted) > 0 {
+		t.Errorf("Empty entities set sorting error: %#v", sorted)
+	}
+}
diff --git a/telegram/utils.go b/telegram/utils.go
index 8de1f5f..f7e7a28 100644
--- a/telegram/utils.go
+++ b/telegram/utils.go
@@ -15,6 +15,7 @@ import (
 	"time"
 
 	"dev.narayana.im/narayana/telegabber/telegram/cache"
+	"dev.narayana.im/narayana/telegabber/telegram/formatter"
 	"dev.narayana.im/narayana/telegabber/xmpp/gateway"
 
 	log "github.com/sirupsen/logrus"
@@ -281,6 +282,7 @@ func (c *Client) formatContent(file *client.File, filename string) string {
 }
 
 func (c *Client) messageToText(message *client.Message) string {
+	markupFunction := formatter.EntityToMarkdown
 	switch message.Content.MessageContentType() {
 	case client.TypeMessageSticker:
 		sticker, _ := message.Content.(*client.MessageSticker)
@@ -318,27 +320,55 @@ func (c *Client) messageToText(message *client.Message) string {
 		)
 	case client.TypeMessagePhoto:
 		photo, _ := message.Content.(*client.MessagePhoto)
-		return photo.Caption.Text
+		return formatter.Format(
+			photo.Caption.Text,
+			formatter.SortEntities(photo.Caption.Entities),
+			markupFunction,
+		)
 	case client.TypeMessageAudio:
 		audio, _ := message.Content.(*client.MessageAudio)
-		return audio.Caption.Text
+		return formatter.Format(
+			audio.Caption.Text,
+			formatter.SortEntities(audio.Caption.Entities),
+			markupFunction,
+		)
 	case client.TypeMessageVideo:
 		video, _ := message.Content.(*client.MessageVideo)
-		return video.Caption.Text
+		return formatter.Format(
+			video.Caption.Text,
+			formatter.SortEntities(video.Caption.Entities),
+			markupFunction,
+		)
 	case client.TypeMessageDocument:
 		document, _ := message.Content.(*client.MessageDocument)
-		return document.Caption.Text
+		return formatter.Format(
+			document.Caption.Text,
+			formatter.SortEntities(document.Caption.Entities),
+			markupFunction,
+		)
 	case client.TypeMessageText:
 		text, _ := message.Content.(*client.MessageText)
-		return text.Text.Text
+		return formatter.Format(
+			text.Text.Text,
+			formatter.SortEntities(text.Text.Entities),
+			markupFunction,
+		)
 	case client.TypeMessageVoiceNote:
 		voice, _ := message.Content.(*client.MessageVoiceNote)
-		return voice.Caption.Text
+		return formatter.Format(
+			voice.Caption.Text,
+			formatter.SortEntities(voice.Caption.Entities),
+			markupFunction,
+		)
 	case client.TypeMessageVideoNote:
 		return ""
 	case client.TypeMessageAnimation:
 		animation, _ := message.Content.(*client.MessageAnimation)
-		return animation.Caption.Text
+		return formatter.Format(
+			animation.Caption.Text,
+			formatter.SortEntities(animation.Caption.Entities),
+			markupFunction,
+		)
 	}
 
 	return fmt.Sprintf("unknown message (%s)", message.Content.MessageContentType())
-- 
cgit v1.2.3