PO(T) Export, for message extraction

Allows stable round-tripping of PO(T) data across multiple message extraction passes.
Avoids canonicalisation of header keys, while preserving other semantics.
This commit is contained in:
Andrew Williams 2020-09-27 07:16:19 +13:00
parent 5133d3fd0b
commit 01a5eea7fa
7 changed files with 632 additions and 29 deletions

413
domain.go
View file

@ -1,10 +1,9 @@
package gotext
import (
"bufio"
"bytes"
"encoding/gob"
"net/textproto"
"sort"
"strconv"
"strings"
"sync"
@ -17,7 +16,7 @@ import (
// Domain has all the common functions for dealing with a gettext domain
// it's initialized with a GettextFile (which represents either a Po or Mo file)
type Domain struct {
Headers textproto.MIMEHeader
Headers HeaderMap
// Language header
Language string
@ -26,6 +25,9 @@ type Domain struct {
// Plural-Forms header
PluralForms string
// Preserve comments at head of PO for round-trip
headerComments []string
// Parsed Plural-Forms header values
nplurals int
plural string
@ -43,11 +45,43 @@ type Domain struct {
// Parsing buffers
trBuffer *Translation
ctxBuffer string
refBuffer string
}
// HeaderMap stores header fields as a mapping from key to the list of values
// recorded for that key. It offers the same method set as
// textproto.MIMEHeader, but keys are used exactly as supplied: no
// canonicalisation is applied on any operation.
type HeaderMap map[string][]string

// Add appends value to the values already stored under key.
func (h HeaderMap) Add(key, value string) {
	h[key] = append(h[key], value)
}

// Del removes key together with all of its values.
func (h HeaderMap) Del(key string) {
	delete(h, key)
}

// Get returns the first value stored under key, or "" when key has no values.
// It may be called on a nil HeaderMap.
func (h HeaderMap) Get(key string) string {
	// Reading from a nil map simply misses, so one length check covers both
	// the nil-map and the missing-key case.
	if values := h[key]; len(values) > 0 {
		return values[0]
	}
	return ""
}

// Set discards whatever is stored under key and stores value alone.
func (h HeaderMap) Set(key, value string) {
	h[key] = []string{value}
}

// Values returns the slice of values stored under key, or nil when the key is
// absent or the HeaderMap itself is nil. The returned slice is not a copy.
func (h HeaderMap) Values(key string) []string {
	return h[key]
}
func NewDomain() *Domain {
domain := new(Domain)
domain.Headers = make(HeaderMap)
domain.headerComments = make([]string, 0)
domain.translations = make(map[string]*Translation)
domain.contexts = make(map[string]map[string]*Translation)
domain.pluralTranslations = make(map[string]*Translation)
@ -73,28 +107,42 @@ func (do *Domain) pluralForm(n int) int {
// parseHeaders retrieves data from previously parsed headers. it's called by both Mo and Po when parsing
func (do *Domain) parseHeaders() {
// Make sure we end with 2 carriage returns.
empty := ""
if _, ok := do.translations[empty]; ok {
empty = do.translations[empty].Get()
raw := ""
if _, ok := do.translations[raw]; ok {
raw = do.translations[raw].Get()
}
raw := empty + "\n\n"
// Read
reader := bufio.NewReader(strings.NewReader(raw))
tp := textproto.NewReader(reader)
// textproto.ReadMIMEHeader() forces keys through CanonicalMIMEHeaderKey(); must read header manually to have one-to-one round-trip of keys
languageKey := "Language"
pluralFormsKey := "Plural-Forms"
var err error
rawLines := strings.Split(raw, "\n")
for _, line := range rawLines {
if len(line) == 0 {
continue
}
do.Headers, err = tp.ReadMIMEHeader()
if err != nil {
return
colonIdx := strings.Index(line, ":")
if colonIdx < 0 {
continue
}
key := line[:colonIdx]
lowerKey := strings.ToLower(key)
if lowerKey == strings.ToLower(languageKey) {
languageKey = key
} else if lowerKey == strings.ToLower(pluralFormsKey) {
pluralFormsKey = key
}
value := strings.TrimSpace(line[colonIdx+1:])
do.Headers.Add(key, value)
}
// Get/save needed headers
do.Language = do.Headers.Get("Language")
do.Language = do.Headers.Get(languageKey)
do.tag = language.Make(do.Language)
do.PluralForms = do.Headers.Get("Plural-Forms")
do.PluralForms = do.Headers.Get(pluralFormsKey)
// Parse Plural-Forms formula
if do.PluralForms == "" {
@ -126,6 +174,80 @@ func (do *Domain) parseHeaders() {
}
}
// DropStaleTranslations deletes every translation whose IsStale method reports
// true, both inside each context and in the context-free table. A context that
// ends up with no translations is deleted as well.
//
// Only do.contexts and do.translations are swept here.
func (do *Domain) DropStaleTranslations() {
	do.trMutex.Lock()
	do.pluralMutex.Lock()
	defer do.trMutex.Unlock()
	defer do.pluralMutex.Unlock()

	for ctxName, entries := range do.contexts {
		// Deleting the entry currently being visited is permitted while ranging.
		for msgID, entry := range entries {
			if !entry.IsStale() {
				continue
			}
			delete(entries, msgID)
		}

		if len(entries) > 0 {
			continue
		}
		delete(do.contexts, ctxName)
	}

	for msgID, entry := range do.translations {
		if !entry.IsStale() {
			continue
		}
		delete(do.translations, msgID)
	}
}
// SetRefs replaces the source references of the context-free translation
// stored under str, creating that translation (with ID str) when it does not
// exist yet.
//
// The update always goes through Translation.SetRefs, so the entry is flagged
// as touched whether it already existed or was just created. Assigning the
// Refs field directly on an existing entry left it stale, so a later
// DropStaleTranslations removed a translation whose references had just been
// refreshed.
func (do *Domain) SetRefs(str string, refs []string) {
	do.trMutex.Lock()
	do.pluralMutex.Lock()
	defer do.trMutex.Unlock()
	defer do.pluralMutex.Unlock()

	trans, ok := do.translations[str]
	if !ok {
		trans = NewTranslation()
		trans.ID = str
		do.translations[str] = trans
	}
	trans.SetRefs(refs)
}
// GetRefs returns the source references recorded for the context-free
// translation stored under str, or nil when no such translation exists.
// The returned slice is the one held by the translation, not a copy.
func (do *Domain) GetRefs(str string) []string {
	// Readers only need the shared lock.
	do.trMutex.RLock()
	defer do.trMutex.RUnlock()

	// A lookup on a nil map just misses, so it needs no separate guard.
	trans, found := do.translations[str]
	if !found {
		return nil
	}
	return trans.Refs
}
// Set stores str as the translation of the context-free message id, creating
// the translation entry when it does not exist yet.
//
// A newly created entry is registered under id. It used to be registered under
// str (the translated text), which made it unreachable through its message ID
// and allowed it to overwrite an unrelated entry whose ID equalled str.
func (do *Domain) Set(id, str string) {
	do.trMutex.Lock()
	do.pluralMutex.Lock()
	defer do.trMutex.Unlock()
	defer do.pluralMutex.Unlock()

	trans, ok := do.translations[id]
	if !ok {
		trans = NewTranslation()
		trans.ID = id
		do.translations[id] = trans
	}
	trans.Set(str)
}
func (do *Domain) Get(str string, vars ...interface{}) string {
// Sync read
do.trMutex.RLock()
@ -141,6 +263,27 @@ func (do *Domain) Get(str string, vars ...interface{}) string {
return Printf(str, vars...)
}
// SetN stores str as the plural form selected by n for the context-free
// message id. When the message is not known yet, a translation is created with
// ID id and PluralID plural; an existing translation keeps its PluralID.
//
// A newly created entry is registered under id. It used to be registered under
// str (the translated text), so a later call for the same id created a second
// entry instead of updating the first.
//
// NOTE(review): entries are kept in do.translations; confirm that the plural
// read path consults the same table.
func (do *Domain) SetN(id, plural string, n int, str string) {
	// Resolve the plural form before taking the write locks.
	pluralForm := do.pluralForm(n)

	do.trMutex.Lock()
	do.pluralMutex.Lock()
	defer do.trMutex.Unlock()
	defer do.pluralMutex.Unlock()

	trans, ok := do.translations[id]
	if !ok {
		trans = NewTranslation()
		trans.ID = id
		trans.PluralID = plural
		do.translations[id] = trans
	}
	trans.SetN(pluralForm, str)
}
// GetN retrieves the (N)th plural form of Translation for the given string.
// Supports optional parameters (vars... interface{}) to be inserted on the formatted string using the fmt.Printf syntax.
func (do *Domain) GetN(str, plural string, n int, vars ...interface{}) string {
@ -161,6 +304,32 @@ func (do *Domain) GetN(str, plural string, n int, vars ...interface{}) string {
return Printf(plural, vars...)
}
// SetC stores str as the translation of message id within context ctx. The
// context table and the translation entry are created on demand.
func (do *Domain) SetC(id, ctx, str string) {
	do.trMutex.Lock()
	do.pluralMutex.Lock()
	defer do.trMutex.Unlock()
	defer do.pluralMutex.Unlock()

	// Find the table for this context, registering a fresh one if needed.
	entries, known := do.contexts[ctx]
	if !known {
		entries = make(map[string]*Translation)
		do.contexts[ctx] = entries
	}

	// Find the entry for this message, registering a fresh one if needed.
	entry, exists := entries[id]
	if !exists {
		entry = NewTranslation()
		entry.ID = id
		entries[id] = entry
	}

	entry.Set(str)
}
// GetC retrieves the corresponding Translation for a given string in the given context.
// Supports optional parameters (vars... interface{}) to be inserted on the formatted string using the fmt.Printf syntax.
func (do *Domain) GetC(str, ctx string, vars ...interface{}) string {
@ -181,6 +350,35 @@ func (do *Domain) GetC(str, ctx string, vars ...interface{}) string {
return Printf(str, vars...)
}
// SetNC stores str as the plural form selected by n for message id within
// context ctx. The context table and the translation entry are created on
// demand.
//
// A newly created translation records plural as its PluralID, matching SetN.
// The plural argument used to be ignored, so entries created here had an empty
// PluralID and MarshalText wrote them as singular messages, dropping
// msgid_plural and every msgstr[n] line. An existing translation keeps its
// PluralID.
//
// NOTE(review): entries are kept in do.contexts; confirm that the plural read
// path consults the same table.
func (do *Domain) SetNC(id, plural, ctx string, n int, str string) {
	// Resolve the plural form before taking the write locks.
	pluralForm := do.pluralForm(n)

	do.trMutex.Lock()
	do.pluralMutex.Lock()
	defer do.trMutex.Unlock()
	defer do.pluralMutex.Unlock()

	entries, ok := do.contexts[ctx]
	if !ok {
		entries = make(map[string]*Translation)
		do.contexts[ctx] = entries
	}

	trans, ok := entries[id]
	if !ok {
		trans = NewTranslation()
		trans.ID = id
		trans.PluralID = plural
		entries[id] = trans
	}
	trans.SetN(pluralForm, str)
}
// GetNC retrieves the (N)th plural form of Translation for the given string in the given context.
// Supports optional parameters (vars... interface{}) to be inserted on the formatted string using the fmt.Printf syntax.
func (do *Domain) GetNC(str, plural string, n int, ctx string, vars ...interface{}) string {
@ -203,6 +401,187 @@ func (do *Domain) GetNC(str, plural string, n int, ctx string, vars ...interface
return Printf(plural, vars...)
}
// SourceReference is the sort record MarshalText builds for each translation
// it writes out. Records are ordered by path, then line, then context, then
// the translation's ID.
//
// The field order matters: MarshalText fills the struct with unkeyed literals.
type SourceReference struct {
// path is the path part of the translation's first source reference, or ""
// when the translation has no references.
path string
// line is the line number parsed from that first reference, or 0.
line int
// context is the name of the context holding the translation, or "" for a
// context-free translation.
context string
// trans is the translation this record stands for.
trans *Translation
}
// extractPathAndLine splits a source reference at its first colon into the
// text before the colon (the path) and the integer parsed from the text after
// it (the line). A reference without a colon is returned whole with line 0.
func extractPathAndLine(ref string) (string, int) {
	sep := strings.IndexRune(ref, ':')
	if sep < 0 {
		return ref, 0
	}

	// The conversion error is deliberately ignored: text that is not a number
	// leaves the line at the value Atoi reports alongside the error (0 for
	// non-numeric input).
	lineNo, _ := strconv.Atoi(ref[sep+1:])
	return ref[:sep], lineNo
}
// MarshalText implements the encoding.TextMarshaler interface and renders the
// domain as PO(T) text so that parsed content can be written back out.
//
// The output consists of, in order:
//   - the comment lines preserved from the head of the parsed file, if any;
//   - the header entry (empty msgid), with header keys written exactly as
//     stored and ordered as xgettext orders them; keys that are not in that
//     list are placed between MIME-Version and Content-Type, sorted by name;
//   - every translation from do.contexts and do.translations except those with
//     an empty ID, sorted by the path and line of the first source reference,
//     then by context, then by ID.
//
// Plural forms are written in ascending index order. They used to be written
// by ranging over the Trs map directly, which made the order of the msgstr[n]
// lines vary from run to run and defeated the stable round-trip this method
// exists for.
//
// The returned error is always nil.
//
// NOTE(review): IDs, contexts, translations and header values are written
// verbatim between double quotes; text containing a double quote or a line
// break is not escaped here. Confirm whether callers can supply such text.
// NOTE(review): no lock is taken while the tables are read; confirm callers
// do not marshal concurrently with writers.
func (do *Domain) MarshalText() ([]byte, error) {
	var buf bytes.Buffer

	if len(do.headerComments) > 0 {
		buf.WriteString(strings.Join(do.headerComments, "\n"))
		buf.WriteByte('\n')
	}

	buf.WriteString("msgid \"\"\nmsgstr \"\"")

	// Standard order consistent with xgettext. Rank 8 is intentionally left
	// free: it is the slot given to every header not listed here.
	headerOrder := map[string]int{
		"project-id-version":        0,
		"report-msgid-bugs-to":      1,
		"pot-creation-date":         2,
		"po-revision-date":          3,
		"last-translator":           4,
		"language-team":             5,
		"language":                  6,
		"mime-version":              7,
		"content-type":              9,
		"content-transfer-encoding": 10,
		"plural-forms":              11,
	}
	const unlistedHeaderRank = 8
	headerRank := func(key string) int {
		if rank, ok := headerOrder[strings.ToLower(key)]; ok {
			return rank
		}
		return unlistedHeaderRank
	}

	headerKeys := make([]string, 0, len(do.Headers))
	for k := range do.Headers {
		headerKeys = append(headerKeys, k)
	}

	sort.Slice(headerKeys, func(i, j int) bool {
		ri, rj := headerRank(headerKeys[i]), headerRank(headerKeys[j])
		if ri != rj {
			return ri < rj
		}
		return headerKeys[i] < headerKeys[j]
	})

	for _, k := range headerKeys {
		// Index the map directly so the key is used exactly as stored.
		for _, value := range do.Headers[k] {
			buf.WriteString("\n\"" + k + ": " + value + "\\n\"")
		}
	}

	// Just as with headers, output translations in a consistent order (to
	// minimise diffs between round-trips): the first source reference takes
	// priority, followed by context and finally ID.
	references := make([]SourceReference, 0, len(do.translations))
	addReference := func(context string, trans *Translation) {
		// A translation without references sorts as path "" and line 0.
		var (
			path string
			line int
		)
		if len(trans.Refs) > 0 {
			path, line = extractPathAndLine(trans.Refs[0])
		}
		references = append(references, SourceReference{
			path:    path,
			line:    line,
			context: context,
			trans:   trans,
		})
	}

	for name, ctx := range do.contexts {
		for id, trans := range ctx {
			if id == "" {
				continue
			}
			addReference(name, trans)
		}
	}

	for id, trans := range do.translations {
		// The empty ID is the header entry, which was written above.
		if id == "" {
			continue
		}
		addReference("", trans)
	}

	sort.Slice(references, func(i, j int) bool {
		a, b := references[i], references[j]
		if a.path != b.path {
			return a.path < b.path
		}
		if a.line != b.line {
			return a.line < b.line
		}
		if a.context != b.context {
			return a.context < b.context
		}
		return a.trans.ID < b.trans.ID
	})

	for _, ref := range references {
		trans := ref.trans

		// Every entry is preceded by a blank line; a reference comment, when
		// present, follows that blank line.
		if len(trans.Refs) > 0 {
			buf.WriteString("\n\n#: " + strings.Join(trans.Refs, " "))
		} else {
			buf.WriteByte('\n')
		}

		if ref.context != "" {
			buf.WriteString("\nmsgctxt \"" + ref.context + "\"")
		}
		buf.WriteString("\nmsgid \"" + trans.ID + "\"")

		if trans.PluralID == "" {
			buf.WriteString("\nmsgstr \"" + trans.Trs[0] + "\"")
			continue
		}

		buf.WriteString("\nmsgid_plural \"" + trans.PluralID + "\"")

		// Map iteration order is random, so collect and sort the form indices
		// to emit msgstr[0], msgstr[1], ... deterministically.
		forms := make([]int, 0, len(trans.Trs))
		for form := range trans.Trs {
			forms = append(forms, form)
		}
		sort.Ints(forms)
		for _, form := range forms {
			buf.WriteString("\nmsgstr[" + strconv.Itoa(form) + "] \"" + trans.Trs[form] + "\"")
		}
	}

	return buf.Bytes(), nil
}
// MarshalBinary implements encoding.BinaryMarshaler interface
func (do *Domain) MarshalBinary() ([]byte, error) {
obj := new(TranslatorEncoding)

3
mo.go
View file

@ -8,7 +8,6 @@ package gotext
import (
"bytes"
"encoding/binary"
"net/textproto"
)
const (
@ -49,7 +48,7 @@ Example:
*/
type Mo struct {
//these three public members are for backwards compatibility. they are just set to the value in the domain
Headers textproto.MIMEHeader
Headers HeaderMap
Language string
PluralForms string
domain *Domain

View file

@ -359,6 +359,9 @@ func tokenize(s string) []string {
Eg: (foo) -> true; (foo)(bar) -> false;
*/
if len(s) == 0 {
return []string{}
}
if s[0] == '(' && s[len(s)-1] == ')' {
s = s[1 : len(s)-1]
}

59
po.go
View file

@ -6,7 +6,6 @@
package gotext
import (
"net/textproto"
"strconv"
"strings"
)
@ -37,7 +36,7 @@ Example:
*/
type Po struct {
//these three public members are for backwards compatibility. they are just set to the value in the domain
Headers textproto.MIMEHeader
Headers HeaderMap
Language string
PluralForms string
@ -66,23 +65,50 @@ func (po *Po) GetDomain() *Domain {
return po.domain
}
//all of these functions are for convenience and aid in backwards compatibility
// Convenience interfaces
// DropStaleTranslations forwards to DropStaleTranslations on the underlying
// Domain.
func (po *Po) DropStaleTranslations() {
po.domain.DropStaleTranslations()
}
// SetRefs forwards to SetRefs on the underlying Domain with the same
// arguments.
func (po *Po) SetRefs(str string, refs []string) {
po.domain.SetRefs(str, refs)
}
// GetRefs forwards to GetRefs on the underlying Domain and returns its result.
func (po *Po) GetRefs(str string) []string {
return po.domain.GetRefs(str)
}
// Set forwards to Set on the underlying Domain with the same arguments.
func (po *Po) Set(id, str string) {
po.domain.Set(id, str)
}
// Get forwards to Get on the underlying Domain and returns its result.
func (po *Po) Get(str string, vars ...interface{}) string {
return po.domain.Get(str, vars...)
}
// SetN forwards to SetN on the underlying Domain with the same arguments.
func (po *Po) SetN(id, plural string, n int, str string) {
po.domain.SetN(id, plural, n, str)
}
// GetN forwards to GetN on the underlying Domain and returns its result.
func (po *Po) GetN(str, plural string, n int, vars ...interface{}) string {
return po.domain.GetN(str, plural, n, vars...)
}
// SetC forwards to SetC on the underlying Domain with the same arguments.
func (po *Po) SetC(id, ctx, str string) {
po.domain.SetC(id, ctx, str)
}
// GetC forwards to GetC on the underlying Domain and returns its result.
func (po *Po) GetC(str, ctx string, vars ...interface{}) string {
return po.domain.GetC(str, ctx, vars...)
}
// SetNC forwards to SetNC on the underlying Domain with the same arguments.
func (po *Po) SetNC(id, plural, ctx string, n int, str string) {
po.domain.SetNC(id, plural, ctx, n, str)
}
// GetNC forwards to GetNC on the underlying Domain and returns its result.
// Note the argument order differs from SetNC: here n precedes ctx.
func (po *Po) GetNC(str, plural string, n int, ctx string, vars ...interface{}) string {
return po.domain.GetNC(str, plural, n, ctx, vars...)
}
// MarshalText forwards to MarshalText on the underlying Domain and returns
// its result.
func (po *Po) MarshalText() ([]byte, error) {
return po.domain.MarshalText()
}
// MarshalBinary forwards to MarshalBinary on the underlying Domain and
// returns its result.
func (po *Po) MarshalBinary() ([]byte, error) {
return po.domain.MarshalBinary()
}
@ -103,7 +129,7 @@ func (po *Po) ParseFile(f string) {
// Parse loads the translations specified in the provided string (str)
func (po *Po) Parse(buf []byte) {
if po.domain == nil {
panic("po.domain must be set when calling Parse")
panic("NewPo() was not used to instantiate this object")
}
// Lock while parsing
@ -118,6 +144,7 @@ func (po *Po) Parse(buf []byte) {
// Init buffer
po.domain.trBuffer = NewTranslation()
po.domain.ctxBuffer = ""
po.domain.refBuffer = ""
state := head
for _, l := range lines {
@ -126,6 +153,7 @@ func (po *Po) Parse(buf []byte) {
// Skip invalid lines
if !po.isValidLine(l) {
po.parseComment(l, state)
continue
}
@ -198,7 +226,28 @@ func (po *Po) saveBuffer() {
}
// Flush Translation buffer
po.domain.trBuffer = NewTranslation()
if po.domain.refBuffer == "" {
po.domain.trBuffer = NewTranslation()
} else {
po.domain.trBuffer = NewTranslationWithRefs(strings.Split(po.domain.refBuffer, " "))
}
}
// parseComment records information carried by a comment line (one starting
// with '#'); any other line is ignored.
//
// While state is head, the whole line is appended to the domain's header
// comments so it can be written back out later. In any other state only
// source-reference comments ("#:" followed by at least one more byte) are
// used: the text after "#:" is trimmed and stored as the reference buffer,
// replacing whatever the buffer held before.
func (po *Po) parseComment(l string, state parseState) {
	if len(l) == 0 || l[0] != '#' {
		return
	}

	if state == head {
		po.domain.headerComments = append(po.domain.headerComments, l)
		return
	}

	// A bare "#:" carries no reference and leaves the buffer untouched.
	if len(l) > 2 && l[1] == ':' {
		po.domain.refBuffer = strings.TrimSpace(l[2:])
	}
}
// parseContext takes a line starting with "msgctxt",

View file

@ -589,3 +589,149 @@ func TestNewPoTranslatorRace(t *testing.T) {
<-pc
<-rc
}
// TestPoBinaryEncoding parses the en_US fixture, encodes it with
// MarshalBinary, decodes the bytes into a second Po and checks that known
// translations are available from the decoded object.
func TestPoBinaryEncoding(t *testing.T) {
	source := NewPo()
	decoded := NewPo()

	source.ParseFile("fixtures/en_US/default.po")

	encoded, err := source.MarshalBinary()
	if err != nil {
		t.Fatal(err)
	}

	if err = decoded.UnmarshalBinary(encoded); err != nil {
		t.Fatal(err)
	}

	// Translations expected to survive the encode/decode cycle.
	expectations := []struct {
		id   string
		want string
	}{
		{"My text", "Translated text"},
		{"language", "en_US"},
	}
	for _, exp := range expectations {
		if got := decoded.Get(exp.id); got != exp.want {
			t.Errorf("Expected '%s' but got '%s'", exp.want, got)
		}
	}
}
// TestPoTextEncoding exercises MarshalText round-trips of the en_US fixture.
//
// It checks that header keys keep their original spelling, that every header
// value is present and unchanged after MarshalText followed by Parse, and that
// translations can be read back. It then updates some translations, drops the
// stale ones and round-trips again to confirm that untouched translations
// disappear while headers and updated translations remain.
//
// The header comparisons fail when a header is missing from the round-tripped
// object or carries a different number of values. Previously a missing header
// passed silently and a shorter value list caused an index-out-of-range panic
// instead of a test failure.
func TestPoTextEncoding(t *testing.T) {
	// Create po objects
	po := NewPo()
	po2 := NewPo()

	// Parse file
	po.ParseFile("fixtures/en_US/default.po")

	if _, ok := po.Headers["Pot-Creation-Date"]; ok {
		t.Errorf("Expected non-canonicalised header, got canonicalised")
	} else if _, ok = po.Headers["POT-Creation-Date"]; !ok {
		t.Errorf("Expected non-canonicalised header, but it was missing")
	}

	// Round-trip
	buff, err := po.MarshalText()
	if err != nil {
		t.Fatal(err)
	}

	po2.Parse(buff)

	for k, v := range po.Headers {
		v2, ok := po2.Headers[k]
		if !ok {
			t.Errorf("TestPoTextEncoding: Header %s missing after round-trip", k)
			continue
		}
		if len(v2) != len(v) {
			t.Errorf("TestPoTextEncoding: Header %s has %d values after round-trip, expected %d", k, len(v2), len(v))
			continue
		}
		for i, value := range v {
			if value != v2[i] {
				t.Errorf("TestPoTextEncoding: Header Difference for %s: %s vs %s", k, value, v2[i])
			}
		}
	}

	// Test translations
	tr := po2.Get("My text")
	if tr != "Translated text" {
		t.Errorf("Expected 'Translated text' but got '%s'", tr)
	}
	tr = po2.Get("language")
	if tr != "en_US" {
		t.Errorf("Expected 'en_US' but got '%s'", tr)
	}
	tr = po2.Get("Some random")
	if tr != "Some random translation" {
		t.Errorf("Expected 'Some random translation' but got '%s'", tr)
	}

	// NOTE(review): the context checks below read from po, the originally
	// parsed object, not from the round-tripped po2. Confirm this is intended.
	v := "Test"
	tr = po.GetC("One with var: %s", "Ctx", v)
	if tr != "This one is the singular in a Ctx context: Test" {
		t.Errorf("Expected 'This one is the singular in a Ctx context: Test' but got '%s'", tr)
	}
	tr = po.GetNC("One with var: %s", "Several with vars: %s", 17, "Ctx", v)
	if tr != "This one is the plural in a Ctx context: Test" {
		t.Errorf("Expected 'This one is the plural in a Ctx context: Test' but got '%s'", tr)
	}

	// Another kind of round-trip
	po.Set("My text", "Translated text")
	po.Set("language", "en_US")

	// But remove 'the'
	po.SetNC("One with var: %s", "Several with vars: %s", "Ctx", 1, "This one is singular in a Ctx context: %s")
	po.SetNC("One with var: %s", "Several with vars: %s", "Ctx", 17, "This one is plural in a Ctx context: %s")

	po.DropStaleTranslations()

	buff, err = po.MarshalText()
	if err != nil {
		t.Fatal(err)
	}

	po2 = NewPo()
	po2.Parse(buff)

	for k, v := range po.Headers {
		v2, ok := po2.Headers[k]
		if !ok || len(v2) != len(v) {
			t.Errorf("Only translations should have been dropped, not headers")
			continue
		}
		for i, value := range v {
			if value != v2[i] {
				t.Errorf("Only translations should have been dropped, not headers")
			}
		}
	}

	tr = po2.Get("My text")
	if tr != "Translated text" {
		t.Errorf("Expected 'Translated text' but got '%s'", tr)
	}
	tr = po2.Get("language")
	if tr != "en_US" {
		t.Errorf("Expected 'en_US' but got '%s'", tr)
	}

	// A dropped translation falls back to its ID, so anything else (including
	// the old translated text) means the entry survived.
	tr = po2.Get("Some random")
	if tr != "Some random" {
		t.Errorf("Expected 'Some random' translation to be dropped; was present")
	}

	// With 'the' removed?
	// NOTE(review): as above, these checks read from po and not from po2.
	v = "Test"
	tr = po.GetC("One with var: %s", "Ctx", v)
	if tr != "This one is singular in a Ctx context: Test" {
		t.Errorf("Expected 'This one is singular in a Ctx context: Test' but got '%s'", tr)
	}
	tr = po.GetNC("One with var: %s", "Several with vars: %s", 17, "Ctx", v)
	if tr != "This one is plural in a Ctx context: Test" {
		t.Errorf("Expected 'This one is plural in a Ctx context: Test' but got '%s'", tr)
	}
}

View file

@ -10,14 +10,37 @@ type Translation struct {
ID string
PluralID string
Trs map[int]string
Refs []string
dirty bool
}
// NewTranslation returns the Translation object and initialized it.
func NewTranslation() *Translation {
tr := new(Translation)
tr.Trs = make(map[int]string)
return &Translation{
Trs: make(map[int]string),
}
}
return tr
// NewTranslationWithRefs returns a new Translation with an initialised, empty
// Trs map and refs as its source references. The refs slice is stored as
// given, not copied.
func NewTranslationWithRefs(refs []string) *Translation {
	tr := new(Translation)
	tr.Trs = make(map[int]string)
	tr.Refs = refs
	return tr
}
// IsStale reports whether the translation has not been marked dirty, that is,
// none of SetRefs, Set or SetN has been called on it.
func (t *Translation) IsStale() bool {
	return !t.dirty
}
// SetRefs replaces the translation's source references with refs (stored as
// given, not copied) and marks the translation dirty.
func (t *Translation) SetRefs(refs []string) {
	t.dirty = true
	t.Refs = refs
}
// Set stores str as the translation at index 0 of Trs, the slot read for
// non-plural messages, and marks the translation dirty.
func (t *Translation) Set(str string) {
	t.SetN(0, str)
}
// Get returns the string of the translation
@ -33,6 +56,11 @@ func (t *Translation) Get() string {
return t.ID
}
// SetN stores str in Trs under index n and marks the translation dirty.
// Trs must already be initialised; writing to a nil map panics.
func (t *Translation) SetN(n int, str string) {
t.Trs[n] = str
// Flag the entry as touched so IsStale stops reporting true for it.
t.dirty = true
}
// GetN returns the string of the plural translation
func (t *Translation) GetN(n int) string {
// Look for Translation index

View file

@ -8,7 +8,6 @@ package gotext
import (
"errors"
"io/ioutil"
"net/textproto"
"os"
)
@ -31,7 +30,7 @@ type Translator interface {
// TranslatorEncoding is used as intermediary storage to encode Translator objects to Gob.
type TranslatorEncoding struct {
// Headers storage
Headers textproto.MIMEHeader
Headers HeaderMap
// Language header
Language string