Den*_*ret 25 utf-8 character-encoding go
当你的XML输入不是UTF-8编码时,xml包的Unmarshal函数似乎需要一个CharsetReader.
你在哪里找到这样的东西?
mor*_*aes 40
更新了2015年及以后的答案:
import (
"encoding/xml"
"golang.org/x/net/html/charset"
)
// Build a streaming XML decoder over the input.
decoder := xml.NewDecoder(reader)
// Use the charset package imported from golang.org/x/net/html/charset as the
// decoder's CharsetReader, the hook the xml package needs for input that is
// not UTF-8 encoded.
decoder.CharsetReader = charset.NewReaderLabel
// Decode the document into parsed; err holds any failure.
err = decoder.Decode(&parsed)
Run Code Online (Sandbox Code Playgroud)
Jon*_*nno 23
在 @anschel-schaffer-cohen 的建议和 @mjibson 的评论基础上补充一下:使用上面提到的go-charset包,只需下面这三行
// Build a streaming XML decoder over the input.
decoder := xml.NewDecoder(reader)
// Let the go-charset package provide the charset-conversion reader.
decoder.CharsetReader = charset.NewReader
// Decode the document into parsed; err holds any failure.
err = decoder.Decode(&parsed)
Run Code Online (Sandbox Code Playgroud)
即可得到所需的结果.只需记得让charset知道其数据文件的位置,方法是调用
charset.CharsetDir = ".../src/code.google.com/p/go-charset/datafiles"
Run Code Online (Sandbox Code Playgroud)
在应用程序启动时的某个时刻.
编辑
与其像上面那样设置charset.CharsetDir = ...,不如直接导入数据文件包,这样更明智.这些数据文件会被当作嵌入式资源处理:
import (
"code.google.com/p/go-charset/charset"
_ "code.google.com/p/go-charset/data"
...
)
Run Code Online (Sandbox Code Playgroud)
这样go install会自动处理好一切,同时也避免了部署时的麻烦(相对于正在运行的应用程序,数据文件应该放在哪里、又该如何找到?).
使用带下划线的import只调用包的init()func,它将所需的东西加载到内存中.
pet*_*rSO 12
这是一个示例Go程序,它使用CharsetReader函数将XML输入从ISO-8859-1转换为UTF-8.该程序打印测试文件XML注释.
package main
import (
"bytes"
"fmt"
"io"
"os"
"strings"
"utf8"
"xml"
)
// CharsetISO88591er converts ISO-8859-1 input to UTF-8 one byte at a time.
// r is the source of input bytes; buf holds the encoded bytes of a
// non-ASCII character that have not been returned by ReadByte yet.
type CharsetISO88591er struct {
r io.ByteReader
buf *bytes.Buffer
}
// NewCharsetISO88591 returns a CharsetISO88591er reading from r, with an
// empty buffer whose initial capacity is utf8.UTFMax bytes.
// r is type-asserted to io.ByteReader without a check, so the call panics
// if r does not implement that interface.
func NewCharsetISO88591(r io.Reader) *CharsetISO88591er {
buf := bytes.NewBuffer(make([]byte, 0, utf8.UTFMax))
return &CharsetISO88591er{r.(io.ByteReader), buf}
}
// ReadByte returns the next byte of the converted stream.
// Input bytes below utf8.RuneSelf are returned unchanged. Any other input
// byte is written to buf as a rune and then served from buf, one byte per
// call. An error from the underlying reader is returned together with a
// zero byte.
func (cs *CharsetISO88591er) ReadByte() (b byte, err os.Error) {
// http://unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT
// Date: 1999 July 27; Last modified: 27-Feb-2001 05:08
// Fetch a new input byte only when no buffered bytes are pending.
if cs.buf.Len() <= 0 {
r, err := cs.r.ReadByte()
if err != nil {
return 0, err
}
// Single-byte fast path: nothing needs to be buffered.
if r < utf8.RuneSelf {
return r, nil
}
// Use the byte value as the rune value and buffer its encoding.
cs.buf.WriteRune(int(r))
}
return cs.buf.ReadByte()
}
// Read is a stub: it never fills p and always returns 0 and os.EINVAL.
// Data has to be pulled through ReadByte instead.
func (cs *CharsetISO88591er) Read(p []byte) (int, os.Error) {
// Use ReadByte method.
return 0, os.EINVAL
}
// isCharset reports whether charset matches any entry of names once both
// sides have been lower-cased.
func isCharset(charset string, names []string) bool {
	want := strings.ToLower(charset)
	for i := 0; i < len(names); i++ {
		if strings.ToLower(names[i]) == want {
			return true
		}
	}
	return false
}
// IsCharsetISO88591 reports whether charset is one of the listed names or
// aliases of ISO-8859-1; the comparison itself is delegated to isCharset.
func IsCharsetISO88591(charset string) bool {
	// Name list taken from http://www.iana.org/assignments/character-sets
	// (last updated 2010-11-04)
	return isCharset(charset, []string{
		"ISO_8859-1:1987", // name
		"ISO-8859-1",      // alias (preferred MIME name)
		// further aliases
		"iso-ir-100",
		"ISO_8859-1",
		"latin1",
		"l1",
		"IBM819",
		"CP819",
		"csISOLatin1",
	})
}
// IsCharsetUTF8 reports whether charset names UTF-8. The empty string is
// accepted as well, as the default when no charset is given.
func IsCharsetUTF8(charset string) bool {
	return isCharset(charset, []string{
		"UTF-8",
		"", // default
	})
}
// CharsetReader picks a reader for XML input declared with the given
// charset. UTF-8 input (per IsCharsetUTF8) is returned unchanged,
// ISO-8859-1 input is wrapped with NewCharsetISO88591, and every other
// charset yields a nil reader and an error naming the charset.
func CharsetReader(charset string, input io.Reader) (io.Reader, os.Error) {
switch {
case IsCharsetUTF8(charset):
return input, nil
case IsCharsetISO88591(charset):
return NewCharsetISO88591(input), nil
}
return nil, os.NewError("CharsetReader: unexpected charset: " + charset)
}
// main opens ISO88591.xml, parses it with CharsetReader installed, and
// prints every processing instruction and comment it encounters.
func main() {
// Print the XML comments from the test file, which should
// contain most of the printable ISO-8859-1 characters.
r, err := os.Open("ISO88591.xml")
if err != nil {
fmt.Println(err)
return
}
defer r.Close()
fmt.Println("file:", r.Name())
p := xml.NewParser(r)
p.CharsetReader = CharsetReader
// Pull tokens until Token returns a nil token or an error; the error
// itself is not reported.
for t, err := p.Token(); t != nil && err == nil; t, err = p.Token() {
switch t := t.(type) {
case xml.ProcInst:
// Processing instruction: print its target and raw content.
fmt.Println(t.Target, string(t.Inst))
case xml.Comment:
// Comment: print its text.
fmt.Println(string([]byte(t)))
}
}
}
Run Code Online (Sandbox Code Playgroud)
要把encoding="ISO-8859-1"的XML从io.Reader r解组(unmarshal)到结构体result中,并使用上面程序里的CharsetReader函数把ISO-8859-1转换为UTF-8,可以这样写:
// Create a parser over r.
p := xml.NewParser(r)
// Install the CharsetReader function from the program above so that
// ISO-8859-1 input is translated to UTF-8.
p.CharsetReader = CharsetReader
// Unmarshal the document into result.
err := p.Unmarshal(&result, nil)
Run Code Online (Sandbox Code Playgroud)
编辑:不要使用这个,使用go-charset答案.
这是@peterSO代码的更新版本,与go1一起使用:
package main
import (
	"bufio"
	"bytes"
	"io"
	"strings"
)
// CharsetISO88591er is an io.Reader that converts an ISO-8859-1 (Latin-1)
// byte stream into UTF-8.
//
// Each input byte is treated as the rune with the same numeric value and
// written out in its UTF-8 encoding, so input bytes >= 0x80 become two
// output bytes.
type CharsetISO88591er struct {
	r   io.ByteReader // source of ISO-8859-1 bytes
	buf *bytes.Buffer // converted bytes not yet handed to the caller
	err error         // first error from r; reported once buf is drained
}

// NewCharsetISO88591 returns a reader that yields the UTF-8 encoding of the
// ISO-8859-1 data read from r.
//
// If r already implements io.ByteReader it is used directly; otherwise it is
// wrapped in a bufio.Reader rather than panicking on a failed type assertion.
func NewCharsetISO88591(r io.Reader) *CharsetISO88591er {
	br, ok := r.(io.ByteReader)
	if !ok {
		br = bufio.NewReader(r)
	}
	return &CharsetISO88591er{r: br, buf: new(bytes.Buffer)}
}

// Read implements io.Reader. It fills p with converted UTF-8 bytes and
// returns the number of bytes written.
//
// Input is only consumed until enough converted bytes are buffered to fill
// p, so the internal buffer never holds more than len(p)+1 bytes. The first
// error from the source (io.EOF or any other) is remembered and returned
// unchanged once all buffered bytes have been delivered.
func (cs *CharsetISO88591er) Read(p []byte) (int, error) {
	if len(p) == 0 {
		return 0, nil
	}
	for cs.err == nil && cs.buf.Len() < len(p) {
		b, rerr := cs.r.ReadByte()
		if rerr != nil {
			// Keep the error; buffered data must be delivered first.
			cs.err = rerr
			break
		}
		cs.buf.WriteRune(rune(b))
	}
	if cs.buf.Len() == 0 {
		// Only reachable after the source failed: len(p) > 0, so the loop
		// above cannot exit with an empty buffer otherwise.
		return 0, cs.err
	}
	return cs.buf.Read(p)
}
// isCharset reports whether charset equals any entry of names, ignoring
// letter case.
//
// strings.EqualFold compares in place, so no lower-cased copies of charset
// or of the candidate names are allocated.
func isCharset(charset string, names []string) bool {
	for _, n := range names {
		if strings.EqualFold(charset, n) {
			return true
		}
	}
	return false
}
// IsCharsetISO88591 reports whether charset is one of the listed names or
// aliases of ISO-8859-1; the comparison itself is delegated to isCharset.
func IsCharsetISO88591(charset string) bool {
	// Name list taken from http://www.iana.org/assignments/character-sets
	// (last updated 2010-11-04)
	aliases := []string{
		"ISO_8859-1:1987", // name
		"ISO-8859-1",      // alias (preferred MIME name)
		// further aliases
		"iso-ir-100",
		"ISO_8859-1",
		"latin1",
		"l1",
		"IBM819",
		"CP819",
		"csISOLatin1",
	}
	matched := isCharset(charset, aliases)
	return matched
}
// unsupportedCharsetError is returned by CharsetReader for a charset it has
// no conversion for; its value is the offending charset label.
type unsupportedCharsetError string

// Error implements the error interface.
func (e unsupportedCharsetError) Error() string {
	return "CharsetReader: unexpected charset: " + string(e)
}

// CharsetReader has the signature required by xml.Decoder.CharsetReader.
//
// ISO-8859-1 input is wrapped in a converting reader. Input labelled as
// UTF-8, ASCII, or with no label at all needs no conversion and is returned
// unchanged. Any other charset yields an error instead of being passed
// through as if it were UTF-8, which would silently corrupt or reject
// non-ASCII text later on.
func CharsetReader(charset string, input io.Reader) (io.Reader, error) {
	switch {
	case IsCharsetISO88591(charset):
		return NewCharsetISO88591(input), nil
	case charset == "",
		strings.EqualFold(charset, "utf-8"),
		strings.EqualFold(charset, "us-ascii"),
		strings.EqualFold(charset, "ascii"):
		// ASCII is a strict subset of UTF-8, so these are already valid.
		return input, nil
	}
	return nil, unsupportedCharsetError(charset)
}
Run Code Online (Sandbox Code Playgroud)
调用方式:
// Build a streaming XML decoder over the input.
d := xml.NewDecoder(reader)
// Install the CharsetReader function defined above as the decoder's
// charset-conversion hook.
d.CharsetReader = CharsetReader
// Decode the document into dst.
err := d.Decode(&dst)
Run Code Online (Sandbox Code Playgroud)
| 归档时间: |
|
| 查看次数: |
9888 次 |
| 最近记录: |