lia*_*ggo 22 character-encoding go
我需要读取以GBK编码的文本文件.Go编程语言中的标准库假定所有文本都以UTF-8编码.
如何读取其他编码中的文件?
Dav*_*e C 18
以前(如旧答案中所述),做到这一点的"简单"方法需要使用依赖cgo并封装iconv库的第三方软件包.出于许多原因,这种做法并不可取.值得庆幸的是,已经有相当一段时间,只需使用Go Authors提供的软件包(不在标准库的主要软件包集合中,而是在Go Sub-Repositories中)就能以地道的Go方式完成这件事.
golang.org/x/text/encoding包为可与UTF-8相互转换的通用字符编码定义了接口.其子包golang.org/x/text/encoding/simplifiedchinese提供了GB18030,GBK和HZ-GB2312编码的实现.
以下是读取和写入GBK编码文件的示例.需要注意的是,io.Reader和io.Writer会在数据被读取/写入的同时"即时"(on the fly)完成编码转换.
package main
import (
"bufio"
"fmt"
"log"
"os"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/transform"
)
// enc is the encoding used by the read and write examples in this
// program: its NewDecoder and NewEncoder methods supply the transformers
// that convert between the file's bytes and UTF-8.
//
// Since this implements the encoding.Encoding
// interface from golang.org/x/text/encoding you can trivially
// change this out for any of the other implemented encodings,
// e.g. `traditionalchinese.Big5`, `charmap.Windows1252`,
// `korean.EUCKR`, etc.
var enc = simplifiedchinese.GBK
// main runs the round trip: it first writes the GBK encoded example file
// and then reads that same file back.
func main() {
	const path = "example_GBK_file"

	exampleWriteGBK(path)
	exampleReadGBK(path)
}
// exampleReadGBK opens the file named filename, decodes its contents
// from the encoding selected by enc into UTF-8 while reading, and prints
// every line to standard output. A failure to open, scan or close the
// file terminates the program through log.Fatal.
func exampleReadGBK(filename string) {
	src, err := os.Open(filename)
	if err != nil {
		log.Fatal(err)
	}

	// Everything read from `decoded` has already been converted to UTF-8.
	decoded := transform.NewReader(src, enc.NewDecoder())

	// Consume the converted text one line at a time, echoing each line.
	lines := bufio.NewScanner(decoded)
	for lines.Scan() {
		fmt.Printf("Read line: %s\n", lines.Text())
	}
	if scanErr := lines.Err(); scanErr != nil {
		log.Fatal(scanErr)
	}

	// Closed explicitly rather than deferred: log.Fatal exits without
	// running deferred calls.
	if closeErr := src.Close(); closeErr != nil {
		log.Fatal(closeErr)
	}
}
// exampleWriteGBK creates the file named filename and writes a sample
// text to it, converting the UTF-8 input to the encoding selected by enc
// as the data is written. A failure to create, write, flush or close the
// file terminates the program through log.Fatal.
func exampleWriteGBK(filename string) {
	// Write UTF-8 to a GBK encoded file.
	f, err := os.Create(filename)
	if err != nil {
		log.Fatal(err)
	}
	w := transform.NewWriter(f, enc.NewEncoder())

	// Write UTF-8 to `w` as desired.
	// As an example we'll write some text from the Wikipedia
	// GBK page that includes Chinese.
	_, err = fmt.Fprintln(w,
		`In 1995, China National Information Technology Standardization
Technical Committee set down the Chinese Internal Code Specification
(Chinese: 汉字内码扩展规范（GBK）; pinyin: Hànzì Nèimǎ
Kuòzhǎn Guīfàn (GBK)), Version 1.0, known as GBK 1.0, which is a
slight extension of Codepage 936. The newly added 95 characters were not
found in GB 13000.1-1993, and were provisionally assigned Unicode PUA
code points.`)
	if err != nil {
		log.Fatal(err)
	}

	// The transform.Writer must be closed to flush any bytes the
	// transformer is still holding; without this the tail of the output
	// can be lost. Closing it does not close the underlying file, so the
	// file is still closed separately below.
	if err = w.Close(); err != nil {
		log.Fatal(err)
	}
	if err = f.Close(); err != nil {
		log.Fatal(err)
	}
}
Run Code Online (Sandbox Code Playgroud)