aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlexander Kiryukhin <a.kiryukhin@mail.ru>2019-05-28 13:24:40 +0300
committerAlexander Kiryukhin <a.kiryukhin@mail.ru>2019-05-28 13:24:40 +0300
commit5c7748b15e2fcc616c89492b85f367750e327d79 (patch)
treeec477f84fef127c42c3df0db55ab9792a74d4c99
parent0911847693eac96cce52f18a6592a93e50464cf2 (diff)
Compatibility with https://github.com/caneroj1/stemmer libraryHEADmaster
-rw-r--r--README.md19
-rw-r--r--stemmer.go58
-rw-r--r--stemmer_test.go6
3 files changed, 76 insertions, 7 deletions
diff --git a/README.md b/README.md
index edd8693..6ff53d4 100644
--- a/README.md
+++ b/README.md
@@ -9,12 +9,29 @@
Этот пакет - реализация [стеммера Портера](https://ru.wikipedia.org/wiki/Стемминг#Стеммер_Портера) для русского языка на Go.
+Интерфейс совместим со стеммером https://github.com/caneroj1/stemmer
+
## Использование
-`основа := StemmerRu.StemWord("слово")`
+`основа := StemmerRu.Stem("слово")`
Преобразует слово на входе в его основу на выходе
+Также из библиотеки https://github.com/caneroj1/stemmer взяты следующие методы:
+
+```
+ // stem a list of words
+ stems := StemmerRu.StemMultiple(strings)
+
+ // stem a list of words in place, modifying the original slice
+ StemmerRu.StemMultipleMutate(strings)
+
+ // stem a list of words concurrently. this also stems in place, modifying
+ // the original slice.
+ // NOTE: the order of the strings is not guaranteed to be the same.
+ StemmerRu.StemConcurrent(strings)
+```
+
## Пример
```
diff --git a/stemmer.go b/stemmer.go
index f15281b..80dc592 100644
--- a/stemmer.go
+++ b/stemmer.go
@@ -1,6 +1,7 @@
package StemmerRu
import (
+ "runtime"
"strings"
+ "sync"
)
@@ -17,7 +18,7 @@ var (
vowels = `аеиоуыэюя`
)
-func StemWord(word string) string {
+func Stem(word string) string {
word = strings.Replace(word, `ё`, `е`, -1)
@@ -27,10 +28,9 @@ func StemWord(word string) string {
return word
}
-
R1pos := getRNPart(word, 0)
R2pos := getRNPart(word, R1pos)
- if (R2pos < RVpos) {
+ if R2pos < RVpos {
R2pos = 0
} else {
R2pos -= RVpos
@@ -140,3 +140,55 @@ func getRNPart(word string, startPos int) int {
return startPos
}
+
+// Code from https://github.com/caneroj1/stemmer
+
+// StemMultiple returns a new slice holding the stem of every word in words,
+// in the same order. The input slice is left untouched; the result is always
+// non-nil and has the same length as the input.
+func StemMultiple(words []string) (output []string) {
+	// Capacity is known up front, so appending never reallocates.
+	output = make([]string, 0, len(words))
+	for i := range words {
+		output = append(output, Stem(words[i]))
+	}
+
+	return output
+}
+
+// StemMultipleMutate replaces every element of the slice that words points to
+// with its stem. The caller's slice is modified in place; nothing is returned.
+func StemMultipleMutate(words *[]string) {
+	// Dereference once; the slice header shares the caller's backing array,
+	// so writes through list are visible to the caller.
+	list := *words
+	for i := range list {
+		list[i] = Stem(list[i])
+	}
+}
+
+// StemConcurrent accepts a pointer to a slice of strings and stems them in place,
+// splitting the work across up to runtime.NumCPU() goroutines.
+//
+// Every goroutine owns a disjoint, contiguous chunk of the slice and writes each
+// stem back to the position its word came from, so no two goroutines touch the
+// same element and the order of the slice is preserved. (Earlier versions
+// funnelled results through a channel and wrote them back from index 0 while
+// workers were still reading their input, which could overwrite words that had
+// not been stemmed yet.)
+//
+// A nil pointer or an empty slice is a no-op. The function returns only after
+// all workers have finished.
+func StemConcurrent(words *[]string) {
+	if words == nil || len(*words) == 0 {
+		return
+	}
+
+	list := *words
+	length := len(list)
+
+	// Never start more workers than there are words to stem.
+	workers := runtime.NumCPU()
+	if workers > length {
+		workers = length
+	}
+
+	// Ceiling division: at most `workers` chunks, and the last chunk simply
+	// absorbs whatever remainder is left, so no separate leftover pass is needed.
+	partition := (length + workers - 1) / workers
+
+	var wg sync.WaitGroup
+	for start := 0; start < length; start += partition {
+		end := start + partition
+		if end > length {
+			end = length
+		}
+
+		wg.Add(1)
+		// The chunk is passed as an argument so each goroutine gets its own
+		// slice header regardless of loop-variable capture semantics.
+		go func(strs []string) {
+			defer wg.Done()
+			for idx, word := range strs {
+				strs[idx] = Stem(word)
+			}
+		}(list[start:end])
+	}
+
+	wg.Wait()
+}
diff --git a/stemmer_test.go b/stemmer_test.go
index 8b27774..334a329 100644
--- a/stemmer_test.go
+++ b/stemmer_test.go
@@ -1,9 +1,9 @@
package StemmerRu
import (
- "testing"
- "io/ioutil"
"encoding/json"
+ "io/ioutil"
+ "testing"
)
var testFile = `tests.json`
@@ -19,7 +19,7 @@ func TestStemWord(t *testing.T) {
t.Error("Can't parse json", err)
}
for source, expected := range *tests {
- result := StemWord(source);
+ result := Stem(source)
if expected != result {
t.Errorf(`Expected "%s" (source: %s) but got "%s"`, result, source, result)
}