Skip to content

Commit a6811af

Browse files
authored
observability: annotate Session+SessionPool events (#1207)
This change adds annotations for session and session pool events to aid customers in debugging latency issues with session pool malevolence and also for maintainers to figure out which session pool type is the most appropriate. Updates #1170
1 parent 1d393fe commit a6811af

14 files changed

+602
-201
lines changed

google/cloud/spanner_v1/_helpers.py

+4
Original file line numberDiff line numberDiff line change
@@ -463,6 +463,7 @@ def _retry(
463463
retry_count=5,
464464
delay=2,
465465
allowed_exceptions=None,
466+
beforeNextRetry=None,
466467
):
467468
"""
468469
Retry a function with a specified number of retries, delay between retries, and list of allowed exceptions.
@@ -479,6 +480,9 @@ def _retry(
479480
"""
480481
retries = 0
481482
while retries <= retry_count:
483+
if retries > 0 and beforeNextRetry:
484+
beforeNextRetry(retries, delay)
485+
482486
try:
483487
return func()
484488
except Exception as exc:

google/cloud/spanner_v1/_opentelemetry_tracing.py

+17-2
Original file line numberDiff line numberDiff line change
@@ -81,10 +81,11 @@ def trace_call(name, session, extra_attributes=None, observability_options=None)
8181
tracer = get_tracer(tracer_provider)
8282

8383
# Set base attributes that we know for every trace created
84+
db = session._database
8485
attributes = {
8586
"db.type": "spanner",
8687
"db.url": SpannerClient.DEFAULT_ENDPOINT,
87-
"db.instance": session._database.name,
88+
"db.instance": "" if not db else db.name,
8889
"net.host.name": SpannerClient.DEFAULT_ENDPOINT,
8990
OTEL_SCOPE_NAME: TRACER_NAME,
9091
OTEL_SCOPE_VERSION: TRACER_VERSION,
@@ -106,7 +107,10 @@ def trace_call(name, session, extra_attributes=None, observability_options=None)
106107
yield span
107108
except Exception as error:
108109
span.set_status(Status(StatusCode.ERROR, str(error)))
109-
span.record_exception(error)
110+
# OpenTelemetry-Python imposes invoking span.record_exception on __exit__
111+
# on any exception. We should file a bug later on with them to only
112+
# invoke .record_exception if not already invoked, hence we should not
113+
# invoke .record_exception on our own else we shall have 2 exceptions.
110114
raise
111115
else:
112116
if (not span._status) or span._status.status_code == StatusCode.UNSET:
@@ -116,3 +120,14 @@ def trace_call(name, session, extra_attributes=None, observability_options=None)
116120
# it wasn't previously set otherwise.
117121
# https://github.com/googleapis/python-spanner/issues/1246
118122
span.set_status(Status(StatusCode.OK))
123+
124+
125+
def get_current_span():
126+
if not HAS_OPENTELEMETRY_INSTALLED:
127+
return None
128+
return trace.get_current_span()
129+
130+
131+
def add_span_event(span, event_name, event_attributes=None):
132+
if span:
133+
span.add_event(event_name, event_attributes)

google/cloud/spanner_v1/database.py

+12
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,10 @@
6767
SpannerGrpcTransport,
6868
)
6969
from google.cloud.spanner_v1.table import Table
70+
from google.cloud.spanner_v1._opentelemetry_tracing import (
71+
add_span_event,
72+
get_current_span,
73+
)
7074

7175

7276
SPANNER_DATA_SCOPE = "https://www.googleapis.com/auth/spanner.data"
@@ -1164,7 +1168,9 @@ def __init__(
11641168

11651169
def __enter__(self):
11661170
"""Begin ``with`` block."""
1171+
current_span = get_current_span()
11671172
session = self._session = self._database._pool.get()
1173+
add_span_event(current_span, "Using session", {"id": session.session_id})
11681174
batch = self._batch = Batch(session)
11691175
if self._request_options.transaction_tag:
11701176
batch.transaction_tag = self._request_options.transaction_tag
@@ -1187,6 +1193,12 @@ def __exit__(self, exc_type, exc_val, exc_tb):
11871193
extra={"commit_stats": self._batch.commit_stats},
11881194
)
11891195
self._database._pool.put(self._session)
1196+
current_span = get_current_span()
1197+
add_span_event(
1198+
current_span,
1199+
"Returned session to pool",
1200+
{"id": self._session.session_id},
1201+
)
11901202

11911203

11921204
class MutationGroupsCheckout(object):

google/cloud/spanner_v1/pool.py

+166-7
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
import datetime
1818
import queue
19+
import time
1920

2021
from google.cloud.exceptions import NotFound
2122
from google.cloud.spanner_v1 import BatchCreateSessionsRequest
@@ -24,6 +25,10 @@
2425
_metadata_with_prefix,
2526
_metadata_with_leader_aware_routing,
2627
)
28+
from google.cloud.spanner_v1._opentelemetry_tracing import (
29+
add_span_event,
30+
get_current_span,
31+
)
2732
from warnings import warn
2833

2934
_NOW = datetime.datetime.utcnow # unit tests may replace
@@ -196,20 +201,50 @@ def bind(self, database):
196201
when needed.
197202
"""
198203
self._database = database
204+
requested_session_count = self.size - self._sessions.qsize()
205+
span = get_current_span()
206+
span_event_attributes = {"kind": type(self).__name__}
207+
208+
if requested_session_count <= 0:
209+
add_span_event(
210+
span,
211+
f"Invalid session pool size({requested_session_count}) <= 0",
212+
span_event_attributes,
213+
)
214+
return
215+
199216
api = database.spanner_api
200217
metadata = _metadata_with_prefix(database.name)
201218
if database._route_to_leader_enabled:
202219
metadata.append(
203220
_metadata_with_leader_aware_routing(database._route_to_leader_enabled)
204221
)
205222
self._database_role = self._database_role or self._database.database_role
223+
if requested_session_count > 0:
224+
add_span_event(
225+
span,
226+
f"Requesting {requested_session_count} sessions",
227+
span_event_attributes,
228+
)
229+
230+
if self._sessions.full():
231+
add_span_event(span, "Session pool is already full", span_event_attributes)
232+
return
233+
206234
request = BatchCreateSessionsRequest(
207235
database=database.name,
208-
session_count=self.size - self._sessions.qsize(),
236+
session_count=requested_session_count,
209237
session_template=Session(creator_role=self.database_role),
210238
)
211239

240+
returned_session_count = 0
212241
while not self._sessions.full():
242+
request.session_count = requested_session_count - self._sessions.qsize()
243+
add_span_event(
244+
span,
245+
f"Creating {request.session_count} sessions",
246+
span_event_attributes,
247+
)
213248
resp = api.batch_create_sessions(
214249
request=request,
215250
metadata=metadata,
@@ -218,6 +253,13 @@ def bind(self, database):
218253
session = self._new_session()
219254
session._session_id = session_pb.name.split("/")[-1]
220255
self._sessions.put(session)
256+
returned_session_count += 1
257+
258+
add_span_event(
259+
span,
260+
f"Requested for {requested_session_count} sessions, returned {returned_session_count}",
261+
span_event_attributes,
262+
)
221263

222264
def get(self, timeout=None):
223265
"""Check a session out from the pool.
@@ -233,12 +275,43 @@ def get(self, timeout=None):
233275
if timeout is None:
234276
timeout = self.default_timeout
235277

236-
session = self._sessions.get(block=True, timeout=timeout)
237-
age = _NOW() - session.last_use_time
278+
start_time = time.time()
279+
current_span = get_current_span()
280+
span_event_attributes = {"kind": type(self).__name__}
281+
add_span_event(current_span, "Acquiring session", span_event_attributes)
238282

239-
if age >= self._max_age and not session.exists():
240-
session = self._database.session()
241-
session.create()
283+
session = None
284+
try:
285+
add_span_event(
286+
current_span,
287+
"Waiting for a session to become available",
288+
span_event_attributes,
289+
)
290+
291+
session = self._sessions.get(block=True, timeout=timeout)
292+
age = _NOW() - session.last_use_time
293+
294+
if age >= self._max_age and not session.exists():
295+
if not session.exists():
296+
add_span_event(
297+
current_span,
298+
"Session is not valid, recreating it",
299+
span_event_attributes,
300+
)
301+
session = self._database.session()
302+
session.create()
303+
# Replacing with the updated session.id.
304+
span_event_attributes["session.id"] = session._session_id
305+
306+
span_event_attributes["session.id"] = session._session_id
307+
span_event_attributes["time.elapsed"] = time.time() - start_time
308+
add_span_event(current_span, "Acquired session", span_event_attributes)
309+
310+
except queue.Empty as e:
311+
add_span_event(
312+
current_span, "No sessions available in the pool", span_event_attributes
313+
)
314+
raise e
242315

243316
return session
244317

@@ -312,13 +385,32 @@ def get(self):
312385
:returns: an existing session from the pool, or a newly-created
313386
session.
314387
"""
388+
current_span = get_current_span()
389+
span_event_attributes = {"kind": type(self).__name__}
390+
add_span_event(current_span, "Acquiring session", span_event_attributes)
391+
315392
try:
393+
add_span_event(
394+
current_span,
395+
"Waiting for a session to become available",
396+
span_event_attributes,
397+
)
316398
session = self._sessions.get_nowait()
317399
except queue.Empty:
400+
add_span_event(
401+
current_span,
402+
"No sessions available in pool. Creating session",
403+
span_event_attributes,
404+
)
318405
session = self._new_session()
319406
session.create()
320407
else:
321408
if not session.exists():
409+
add_span_event(
410+
current_span,
411+
"Session is not valid, recreating it",
412+
span_event_attributes,
413+
)
322414
session = self._new_session()
323415
session.create()
324416
return session
@@ -427,6 +519,38 @@ def bind(self, database):
427519
session_template=Session(creator_role=self.database_role),
428520
)
429521

522+
span_event_attributes = {"kind": type(self).__name__}
523+
current_span = get_current_span()
524+
requested_session_count = request.session_count
525+
if requested_session_count <= 0:
526+
add_span_event(
527+
current_span,
528+
f"Invalid session pool size({requested_session_count}) <= 0",
529+
span_event_attributes,
530+
)
531+
return
532+
533+
add_span_event(
534+
current_span,
535+
f"Requesting {requested_session_count} sessions",
536+
span_event_attributes,
537+
)
538+
539+
if created_session_count >= self.size:
540+
add_span_event(
541+
current_span,
542+
"Created no new sessions as sessionPool is full",
543+
span_event_attributes,
544+
)
545+
return
546+
547+
add_span_event(
548+
current_span,
549+
f"Creating {request.session_count} sessions",
550+
span_event_attributes,
551+
)
552+
553+
returned_session_count = 0
430554
while created_session_count < self.size:
431555
resp = api.batch_create_sessions(
432556
request=request,
@@ -436,8 +560,16 @@ def bind(self, database):
436560
session = self._new_session()
437561
session._session_id = session_pb.name.split("/")[-1]
438562
self.put(session)
563+
returned_session_count += 1
564+
439565
created_session_count += len(resp.session)
440566

567+
add_span_event(
568+
current_span,
569+
f"Requested for {requested_session_count} sessions, return {returned_session_count}",
570+
span_event_attributes,
571+
)
572+
441573
def get(self, timeout=None):
442574
"""Check a session out from the pool.
443575
@@ -452,7 +584,26 @@ def get(self, timeout=None):
452584
if timeout is None:
453585
timeout = self.default_timeout
454586

455-
ping_after, session = self._sessions.get(block=True, timeout=timeout)
587+
start_time = time.time()
588+
span_event_attributes = {"kind": type(self).__name__}
589+
current_span = get_current_span()
590+
add_span_event(
591+
current_span,
592+
"Waiting for a session to become available",
593+
span_event_attributes,
594+
)
595+
596+
ping_after = None
597+
session = None
598+
try:
599+
ping_after, session = self._sessions.get(block=True, timeout=timeout)
600+
except queue.Empty as e:
601+
add_span_event(
602+
current_span,
603+
"No sessions available in the pool within the specified timeout",
604+
span_event_attributes,
605+
)
606+
raise e
456607

457608
if _NOW() > ping_after:
458609
# Using session.exists() guarantees the returned session exists.
@@ -462,6 +613,14 @@ def get(self, timeout=None):
462613
session = self._new_session()
463614
session.create()
464615

616+
span_event_attributes.update(
617+
{
618+
"time.elapsed": time.time() - start_time,
619+
"session.id": session._session_id,
620+
"kind": "pinging_pool",
621+
}
622+
)
623+
add_span_event(current_span, "Acquired session", span_event_attributes)
465624
return session
466625

467626
def put(self, session):

0 commit comments

Comments
 (0)