
Deadlock in statusConnectionPool #20

Closed
@kuningfellow

Description

A deadlock occurs in statusConnectionPool.OnFailure, in the comparator passed to sort.Slice:

https://github.com/elastic/elastic-transport-go/blob/v8.5.0/elastictransport/connection.go#L184

	sort.Slice(cp.dead, func(i, j int) bool {
		c1 := cp.dead[i]
		c2 := cp.dead[j]
		c1.Lock()
		c2.Lock() // <- here
		defer c1.Unlock()
		defer c2.Unlock()

		res := c1.Failures > c2.Failures
		return res
	})
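
For context, the comparator locks two connections at a time with no fixed ordering. If two dead lists contain the same *Connection values and are sorted concurrently, the two sorts can acquire those locks in opposite orders, which is the classic lock-ordering deadlock. Here is a standalone toy (with a hypothetical conn type, not library code) showing the hazard:

package main

import (
	"sort"
	"sync"
)

// conn is a hypothetical stand-in for elastictransport's Connection:
// a mutex plus the failure counter the comparator reads.
type conn struct {
	sync.Mutex
	Failures int
}

func sortDead(dead []*conn, wg *sync.WaitGroup) {
	defer wg.Done()
	sort.Slice(dead, func(i, j int) bool {
		c1, c2 := dead[i], dead[j]
		c1.Lock()
		c2.Lock() // same pattern as connection.go:184
		defer c1.Unlock()
		defer c2.Unlock()
		return c1.Failures > c2.Failures
	})
}

func main() {
	a, b := &conn{}, &conn{}
	var wg sync.WaitGroup
	wg.Add(2)
	// Two dead lists sharing the same connections in opposite order:
	// one sort may lock a then b while the other locks b then a.
	go sortDead([]*conn{a, b}, &wg)
	go sortDead([]*conn{b, a}, &wg)
	wg.Wait() // can hang forever under the right interleaving
}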

Here is my hypothesis. Note that Perform reads c.pool once when calling Next() and again later when calling OnFailure(), so a concurrent DiscoverNodes() can swap the pool in between:

	/*
		Initially using Pool 1

			Goroutine 1:                                        |  DiscoverNodes()  | Pool 2
			Goroutine 2: get Pool 1 node        OnFailure()     |                   |  ...............{resurrects on Pool 1}
			Goroutine 3: get Pool 1 node                        |                   |  OnFailure()
			Goroutine 4: get Pool 1 node                        |                   |  ....................................OnFailure()
	*/

Here is my test function:

package elastictransport

import (
	"bytes"
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"net/http"
	_ "net/http/pprof"
	"net/url"
	"sync"
	"testing"
	"time"
)

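// Context keys that tell the mock transport, per request, whether the call should fail and which hook to run.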
type ctxKey int

const shouldError, onTransport = ctxKey(1), ctxKey(2)

var someErr = errors.New("some error")

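// mockTransport answers /_nodes/http with two fixed nodes and otherwise executes the hooks carried in the request context.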
type mockTransport struct {
	t *testing.T
}

func (m *mockTransport) RoundTrip(req *http.Request) (*http.Response, error) {
	switch req.URL.Path {
	case "/_nodes/http":
		n := nodeInfo{}
		n.HTTP.PublishAddress = "localhost:1234"
		n2 := nodeInfo{}
		n2.HTTP.PublishAddress = "localhost:5678"
		return m.fromJson(map[string]map[string]nodeInfo{
			"nodes": {
				"es0": n,
				"es1": n2,
			},
		}), nil
	default:
		if f, ok := req.Context().Value(onTransport).(func()); ok {
			f()
		}
		if y, _ := req.Context().Value(shouldError).(bool); y {
			return nil, someErr
		}
		return &http.Response{
			StatusCode: 200,
			Body:       io.NopCloser(bytes.NewReader(nil)),
		}, nil
	}
}

func (m *mockTransport) fromJson(v interface{}) *http.Response {
	b, err := json.Marshal(v)
	if err != nil {
		m.t.Fatal(err)
	}
	return &http.Response{
		StatusCode: 200,
		Body:       io.NopCloser(bytes.NewReader(b)),
	}
}

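// lg routes the transport's debug log output to stdout.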
type lg struct{}

func (lg) Log(a ...interface{}) error {
	fmt.Println(a...)
	return nil
}
func (lg) Logf(format string, a ...interface{}) error {
	fmt.Printf(format, a...)
	return nil
}

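// TestDeadlock interleaves failing requests with a DiscoverNodes pool swap,
// using WaitGroups and context hooks to force the problematic ordering.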
func TestDeadlock(t *testing.T) {
	go func() {
		http.ListenAndServe(":8080", nil)
	}()
	debugLogger = lg{}
	defaultResurrectTimeoutInitial = 10 * time.Second

	var (
		transport http.RoundTripper = &mockTransport{t}
		makeReq                     = func(ctx context.Context) *http.Request {
			t.Helper()
			req, err := http.NewRequestWithContext(ctx, "GET", "", io.NopCloser(bytes.NewReader(nil)))
			if err != nil {
				t.Fatal(err)
			}
			return req
		}
	)
	u, err := url.Parse("http://localhost:9200") // just the initial node
	if err != nil {
		t.Fatal(err)
	}

	c, err := New(Config{
		Transport:    transport,
		URLs:         []*url.URL{u, u},
		DisableRetry: true,
	})
	if err != nil {
		t.Fatal(err)
	}

	c.DiscoverNodes()

	/*
		Initially using Pool 1

			Goroutine 1:                                        |  DiscoverNodes()  | Pool 2
			Goroutine 2: get Pool 1 node        OnFailure()     |                   |  ...............{resurrects on Pool 1}
			Goroutine 3: get Pool 1 node                        |                   |  OnFailure()
			Goroutine 4: get Pool 1 node                        |                   |  ....................................OnFailure()
	*/

	var (
		firstPhaseWg,
		secondPhaseWg,
		thirdPhaseWg,
		getPoolWg,
		discoverNodesWg sync.WaitGroup
	)

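	// Phase 1: fail a request against Pool 1; the getPoolWg.Done() hook
	// fires once the pool has already handed this goroutine its connection.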
	var firstPhases = func(c *Client) {
		defer firstPhaseWg.Done()
		ctx := context.Background()
		ctx = context.WithValue(ctx, shouldError, true)
		ctx = context.WithValue(ctx, onTransport, func() {
			getPoolWg.Done()
		})
		c.Perform(makeReq(ctx))
	}

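	// Once every goroutine holds its connection, swap in Pool 2.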
	var discoverNodesPhase = func(c *Client) {
		defer discoverNodesWg.Done()
		getPoolWg.Wait()
		c.DiscoverNodes()
	}

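	// Phase 2: hold the request open until phase 1 and the pool swap finish,
	// so OnFailure is reported to the new pool for a connection from the old one.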
	var secondPhases = func(c *Client) {
		defer secondPhaseWg.Done()
		ctx := context.Background()
		ctx = context.WithValue(ctx, shouldError, true)
		ctx = context.WithValue(ctx, onTransport, func() {
			getPoolWg.Done()
			firstPhaseWg.Wait()
			discoverNodesWg.Wait()
		})
		c.Perform(makeReq(ctx))
	}

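	// Phase 3: delay past the 10s resurrect timeout so these failures
	// overlap with phase 2's and with Pool 1 resurrections.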
	var thirdPhases = func(c *Client) {
		defer thirdPhaseWg.Done()
		ctx := context.Background()
		ctx = context.WithValue(ctx, shouldError, true)
		ctx = context.WithValue(ctx, onTransport, func() {
			getPoolWg.Done()
			secondPhaseWg.Wait()
			time.Sleep(10 * time.Second)
		})
		c.Perform(makeReq(ctx))
	}

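	// runPar registers n goroutines on each WaitGroup, then launches n copies of f.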
	runPar := func(n int, wg []*sync.WaitGroup, f func(*Client), c *Client) {
		for _, w := range wg {
			w.Add(n)
		}
		for i := 0; i < n; i++ {
			go f(c)
		}
	}

	var N = 10
	runPar(N, []*sync.WaitGroup{&firstPhaseWg, &getPoolWg}, firstPhases, c)
	runPar(N, []*sync.WaitGroup{&secondPhaseWg, &getPoolWg}, secondPhases, c)
	runPar(N, []*sync.WaitGroup{&thirdPhaseWg, &getPoolWg}, thirdPhases, c)
	runPar(1, []*sync.WaitGroup{&discoverNodesWg}, discoverNodesPhase, c)

	thirdPhaseWg.Wait()

	fmt.Println("pool urls", c.pool.URLs())
	fmt.Println("deads", c.pool.(*statusConnectionPool).dead)
}

I can consistently get it to deadlock; when I open http://localhost:8080/debug/pprof/goroutine?debug=1 while the test hangs, I get this profile, showing a goroutine parked on the Lock() inside the sort comparator at connection.go:184:

1 @ 0x43b356 0x44ca0f 0x44c9e6 0x4694e6 0x474065 0x73d745 0x73d71d 0x490251 0x49083d 0x48fd9a 0x73d429 0x741c09 0x74bda9 0x46d701
#	0x4694e5	sync.runtime_SemacquireMutex+0x25										/home/<user>/go1.20.11/go/src/runtime/sema.go:77
#	0x474064	sync.(*Mutex).lockSlow+0x164											/home/<user>/go1.20.11/go/src/sync/mutex.go:171
#	0x73d744	sync.(*Mutex).Lock+0xa4												/home/<user>/go1.20.11/go/src/sync/mutex.go:90
#	0x73d71c	github.com/elastic/elastic-transport-go/v8/elastictransport.(*statusConnectionPool).OnFailure.func1+0x7c	/home/<user>/github.com/elastic/elastic-transport-go/elastictransport/connection.go:184
#	0x490250	sort.insertionSort_func+0xb0											/home/<user>/go1.20.11/go/src/sort/zsortfunc.go:12
#	0x49083c	sort.pdqsort_func+0x2dc												/home/<user>/go1.20.11/go/src/sort/zsortfunc.go:73
#	0x48fd99	sort.Slice+0xf9													/home/<user>/go1.20.11/go/src/sort/slice.go:26
#	0x73d428	github.com/elastic/elastic-transport-go/v8/elastictransport.(*statusConnectionPool).OnFailure+0x268		/home/<user>/github.com/elastic/elastic-transport-go/elastictransport/connection.go:180
#	0x741c08	github.com/elastic/elastic-transport-go/v8/elastictransport.(*Client).Perform+0x9a8				/home/<user>/github.com/elastic/elastic-transport-go/elastictransport/elastictransport.go:386
#	0x74bda8	github.com/elastic/elastic-transport-go/v8/elastictransport.TestDeadlock.func6+0x148				/home/<user>/github.com/elastic/elastic-transport-go/elastictransport/a_test.go:172

Suggested fix: in *Client.Perform:

  1. Save c.pool to a local variable when calling .Next()
  2. Use that local variable when calling OnSuccess() or OnFailure(), so the result is reported to the same pool that handed out the connection (see the sketch below)
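
To make the fix concrete, here is a minimal standalone sketch of the pattern; the client and pool types are hypothetical stand-ins, not the real elastictransport API:

package main

import "sync"

// Hypothetical stand-ins for elastictransport's ConnectionPool and Client.
type pool struct{ name string }

func (p *pool) Next() string          { return p.name + "-conn" }
func (p *pool) OnFailure(conn string) {}

type client struct {
	mu   sync.Mutex
	pool *pool // replaced by DiscoverNodes
}

// perform reads c.pool exactly once, so Next() and OnFailure() are
// guaranteed to hit the same pool instance even if a concurrent
// DiscoverNodes swaps c.pool between the two calls.
func (c *client) perform() {
	c.mu.Lock()
	p := c.pool // 1. save the pool to a local variable
	c.mu.Unlock()

	conn := p.Next()
	// ... the request fails ...
	p.OnFailure(conn) // 2. report to the pool that handed out conn
}

func main() {
	c := &client{pool: &pool{name: "pool1"}}
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		c.perform()
	}()
	// A DiscoverNodes-style swap racing with perform:
	c.mu.Lock()
	c.pool = &pool{name: "pool2"}
	c.mu.Unlock()
	wg.Wait()
}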

Thanks.
