使用 goroutine 并发加载大型 CSV 时出现未定义的行为

May*_*kar 3 csv go goroutine

我正在尝试用 Go 语言通过 goroutine 加载一个大型 CSV 文件,该 CSV 的维度为 (254882, 100)。但是,当我用 goroutine 解析 CSV 并把结果存入二维切片时,得到的行数少于 254882,而且每次运行得到的行数都不一样。我觉得问题出在 goroutine 上,但找不出具体原因。有人能帮帮我吗?我也是 Go 语言的新手。我的代码如下:

// loadCSV opens csvFile, reads it record by record with encoding/csv, skips
// the first record, and converts each remaining record into a []float64 in
// its own goroutine. It returns a pointer to the collected rows.
//
// The only error it reports is a failure to open the file; in that case the
// returned pointer refers to an empty dataset.
//
// NOTE(review): this listing contains a data race (see the comment on the
// append to *dataset below), so the number of rows returned can vary from
// run to run.
// NOTE(review): f is never closed in this function.
func loadCSV(csvFile string) (*[][]float64, error) {
    startTime := time.Now()
    var dataset [][]float64
    f, err := os.Open(csvFile)
    if err != nil {
        return &dataset, err
    }
    r := csv.NewReader(bufio.NewReader(f))
    // counter counts every record read, including the first one.
    counter := 0
    var wg sync.WaitGroup
    for {
        record, err := r.Read()
        if err == io.EOF {
            break
        }
        // NOTE(review): any error other than io.EOF is not checked here, so
        // execution falls through with whatever record Read returned.
        // counter != 0 skips the first record of the file.
        if counter != 0 {
            wg.Add(1)
            // NOTE(review): the parameter r (which shadows the csv reader) is
            // never used; the body ranges over the captured variable record.
            go func(r []string, dataset *[][]float64) {
                var temp []float64
                for _, each := range record {
                    f, err := strconv.ParseFloat(each, 64)
                    // Fields that fail to parse are skipped, so temp can end
                    // up shorter than the record it came from.
                    if err == nil {
                        temp = append(temp, f)
                    }
                }
                // NOTE(review): every goroutine reads and writes *dataset here
                // with no lock or other synchronization visible in this
                // function. The WaitGroup only makes the caller wait for the
                // goroutines to finish; it does not order these appends, so
                // concurrent appends can overwrite each other and drop rows.
                *dataset = append(*dataset, temp)
                wg.Done()
            }(record, &dataset)
        }
        counter++
    }
    wg.Wait()
    duration := time.Now().Sub(startTime)
    // NOTE(review): counter includes the skipped first record, and duration
    // is a time.Duration formatted with %v ahead of the literal "seconds".
    log.Printf("Loaded %d rows in %v seconds", counter, duration)
    return &dataset, nil
}
Run Code Online (Sandbox Code Playgroud)

我的 main 函数如下所示:

// main loads AvgW2V_train.csv through loadCSV and prints how many rows came
// back. It panics if loadCSV reports an error.
func main() {
    // runtime.GOMAXPROCS(4)
    rows, loadErr := loadCSV("AvgW2V_train.csv")
    if loadErr != nil {
        panic(loadErr)
    }
    rowCount := len(*rows)
    fmt.Println(rowCount)
}
Run Code Online (Sandbox Code Playgroud)

如果有人也需要下载CSV,请点击下面的链接(485 MB) https://drive.google.com/file/d/1G4Nw6JyeC-i0R1exWp5BtRtGM1Fwyelm/view?usp=sharing

pet*_*rSO 6

Go Data Race Detector


您的结果是未定义的,因为代码中存在数据竞争。

~/gopath/src$ go run -race racer.go
==================
WARNING: DATA RACE
Write at 0x00c00008a060 by goroutine 6:
  runtime.mapassign_faststr()
      /home/peter/go/src/runtime/map_faststr.go:202 +0x0
  main.main.func2()
      /home/peter/gopath/src/racer.go:16 +0x6a

Previous write at 0x00c00008a060 by goroutine 5:
  runtime.mapassign_faststr()
      /home/peter/go/src/runtime/map_faststr.go:202 +0x0
  main.main.func1()
      /home/peter/gopath/src/racer.go:11 +0x6a

Goroutine 6 (running) created at:
  main.main()
      /home/peter/gopath/src/racer.go:14 +0x88

Goroutine 5 (running) created at:
  main.main()
      /home/peter/gopath/src/racer.go:9 +0x5b
==================
fatal error: concurrent map writes
==================
WARNING: DATA RACE
Write at 0x00c00009a088 by goroutine 6:
  main.main.func2()
      /home/peter/gopath/src/racer.go:16 +0x7f

Previous write at 0x00c00009a088 by goroutine 5:
  main.main.func1()
      /home/peter/gopath/src/racer.go:11 +0x7f

Goroutine 6 (running) created at:
  main.main()
      /home/peter/gopath/src/racer.go:14 +0x88

Goroutine 5 (running) created at:
  main.main()
      /home/peter/gopath/src/racer.go:9 +0x5b
==================

goroutine 34 [running]:
runtime.throw(0x49e156, 0x15)
    /home/peter/go/src/runtime/panic.go:608 +0x72 fp=0xc000094718 sp=0xc0000946e8 pc=0x44b342
runtime.mapassign_faststr(0x48ace0, 0xc00008a060, 0x49c9c3, 0x8, 0xc00009a088)
    /home/peter/go/src/runtime/map_faststr.go:211 +0x46c fp=0xc000094790 sp=0xc000094718 pc=0x43598c
main.main.func1(0x49c9c3, 0x8)
    /home/peter/gopath/src/racer.go:11 +0x6b fp=0xc0000947d0 sp=0xc000094790 pc=0x47ac6b
runtime.goexit()
    /home/peter/go/src/runtime/asm_amd64.s:1340 +0x1 fp=0xc0000947d8 sp=0xc0000947d0 pc=0x473061
created by main.main
    /home/peter/gopath/src/racer.go:9 +0x5c

goroutine 1 [sleep]:
time.Sleep(0x5f5e100)
    /home/peter/go/src/runtime/time.go:105 +0x14a
main.main()
    /home/peter/gopath/src/racer.go:19 +0x96

goroutine 35 [runnable]:
main.main.func2(0x49c9c3, 0x8)
    /home/peter/gopath/src/racer.go:16 +0x6b
created by main.main
    /home/peter/gopath/src/racer.go:14 +0x89
exit status 2
~/gopath/src$ 
Run Code Online (Sandbox Code Playgroud)

racer.go:

package main

import (
    "bufio"
    "encoding/csv"
    "fmt"
    "io"
    "log"
    "os"
    "strconv"
    "sync"
    "time"
)

// loadCSV reads csvFile, skips its first (header) record, and converts every
// remaining record into a row of float64 values. Rows are returned in file
// order.
//
// Fields are parsed concurrently by a small fixed pool of goroutines. Every
// access to the shared dataset slice happens under a mutex: the reading
// goroutine reserves a slot for each record and a worker later fills that
// slot. Without that lock, concurrent appends overwrite each other and rows
// are silently lost, which is the data race the previous version had.
//
// Fields that strconv.ParseFloat rejects are skipped, so a row may hold fewer
// values than its record had fields.
//
// The returned pointer is never nil. If the file cannot be opened, the error
// from os.Open is returned as is. If a record cannot be read, the error is
// wrapped with the file name. In both cases the returned dataset is empty.
func loadCSV(csvFile string) (*[][]float64, error) {
    startTime := time.Now()
    var dataset [][]float64

    f, err := os.Open(csvFile)
    if err != nil {
        return &dataset, err
    }
    defer f.Close()

    // A fixed pool bounds the number of live goroutines no matter how many
    // rows the file has. The size is a tuning knob, not a correctness
    // requirement.
    const parseWorkers = 8

    type parseJob struct {
        row    int      // index of the reserved slot in dataset
        record []string // raw fields of one CSV record
    }

    var (
        mu   sync.Mutex // guards dataset while workers are running
        wg   sync.WaitGroup
        jobs = make(chan parseJob, parseWorkers)
    )
    for i := 0; i < parseWorkers; i++ {
        wg.Add(1)
        go func() {
            defer wg.Done()
            for job := range jobs {
                // Parse outside the lock; only the store is serialized.
                parsed := parseFloatRow(job.record)
                mu.Lock()
                dataset[job.row] = parsed
                mu.Unlock()
            }
        }()
    }

    r := csv.NewReader(bufio.NewReader(f))
    var readErr error
    headerSkipped := false
    for {
        record, err := r.Read()
        if err == io.EOF {
            break
        }
        if err != nil {
            // Stop at the first unreadable record instead of looping on with
            // a nil record.
            readErr = fmt.Errorf("reading %q: %w", csvFile, err)
            break
        }
        if !headerSkipped {
            headerSkipped = true
            continue
        }

        // Reserve the slot here so that rows keep the order of the file. The
        // append may move the backing array, so it has to happen under the
        // same lock the workers use for their stores.
        mu.Lock()
        dataset = append(dataset, nil)
        row := len(dataset) - 1
        mu.Unlock()

        jobs <- parseJob{row: row, record: record}
    }
    close(jobs)
    wg.Wait()

    if readErr != nil {
        var empty [][]float64
        return &empty, readErr
    }

    // Report the rows actually loaded (the header is not one of them) and the
    // elapsed time as a plain number of seconds.
    log.Printf("Loaded %d rows in %v seconds", len(dataset), time.Since(startTime).Seconds())
    return &dataset, nil
}

// parseFloatRow converts the fields of one CSV record to float64 values,
// skipping any field that strconv.ParseFloat cannot parse.
func parseFloatRow(record []string) []float64 {
    row := make([]float64, 0, len(record))
    for _, field := range record {
        v, err := strconv.ParseFloat(field, 64)
        if err != nil {
            continue
        }
        row = append(row, v)
    }
    return row
}

// main loads the training CSV through loadCSV and prints the number of rows
// in the returned dataset. It panics if loadCSV reports an error.
func main() {
    // runtime.GOMAXPROCS(4)
    const trainingFile = "/home/peter/AvgW2V_train.csv"

    rows, loadErr := loadCSV(trainingFile)
    if loadErr != nil {
        panic(loadErr)
    }
    fmt.Println(len(*rows))
}
Run Code Online (Sandbox Code Playgroud)

  • @MayukhSarkar 使用 WaitGroup 并不能解决数据竞争问题。解决方案是使用互斥锁(一种锁定机制)来防止数据竞争 (3认同)