Skip to content

Commit 4e043b5

Browse files
committed
Cast to uint64 for all array-based doc representations (#11933)
* Convert all individual values explicitly to uint64 for array-based doc representations
* Temporarily test with latest numpy v1.24.0rc
* Remove unnecessary conversion from attr_t
* Reduce number of individual casts
* Convert specifically from int32 to uint64
* Revert "Temporarily test with latest numpy v1.24.0rc"
  This reverts commit eb0e3c5.
* Also use int32 in tests
1 parent b83abde commit 4e043b5

File tree

4 files changed

+14
-11
lines changed

4 files changed

+14
-11
lines changed

spacy/tests/doc/test_array.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):
123123

124124
# head before start
125125
arr = doc.to_array(["HEAD"])
126-
arr[0] = -1
126+
arr[0] = numpy.int32(-1).astype(numpy.uint64)
127127
doc_from_array = Doc(en_vocab, words=words)
128128
with pytest.raises(ValueError):
129129
doc_from_array.from_array(["HEAD"], arr)
130130

131131
# head after end
132132
arr = doc.to_array(["HEAD"])
133-
arr[0] = 5
133+
arr[0] = numpy.int32(5).astype(numpy.uint64)
134134
doc_from_array = Doc(en_vocab, words=words)
135135
with pytest.raises(ValueError):
136136
doc_from_array.from_array(["HEAD"], arr)

spacy/tokens/doc.pyx

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -359,6 +359,7 @@ cdef class Doc:
359359
for annot in annotations:
360360
if annot:
361361
if annot is heads or annot is sent_starts or annot is ent_iobs:
362+
annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
362363
for i in range(len(words)):
363364
if attrs.ndim == 1:
364365
attrs[i] = annot[i]
@@ -1558,6 +1559,7 @@ cdef class Doc:
15581559

15591560
for j, (attr, annot) in enumerate(token_annotations.items()):
15601561
if attr is HEAD:
1562+
annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
15611563
for i in range(len(words)):
15621564
array[i, j] = annot[i]
15631565
elif attr is MORPH:

spacy/tokens/span.pyx

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -299,15 +299,15 @@ cdef class Span:
299299
for ancestor in ancestors:
300300
ancestor_i = ancestor.i - self.c.start
301301
if ancestor_i in range(length):
302-
array[i, head_col] = ancestor_i - i
302+
array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)
303303

304304
# if there is no appropriate ancestor, define a new artificial root
305305
value = array[i, head_col]
306306
if (i+value) not in range(length):
307307
new_root = old_to_new_root.get(ancestor_i, None)
308308
if new_root is not None:
309309
# take the same artificial root as a previous token from the same sentence
310-
array[i, head_col] = new_root - i
310+
array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64)
311311
else:
312312
# set this token as the new artificial root
313313
array[i, head_col] = 0

spacy/training/example.pyx

Lines changed: 8 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -443,26 +443,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
443443
if key not in IDS:
444444
raise ValueError(Errors.E974.format(obj="token", key=key))
445445
elif key in ["ORTH", "SPACY"]:
446-
pass
446+
continue
447447
elif key == "HEAD":
448448
attrs.append(key)
449-
values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
449+
row = [h-i if h is not None else 0 for i, h in enumerate(value)]
450450
elif key == "DEP":
451451
attrs.append(key)
452-
values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
452+
row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
453453
elif key == "SENT_START":
454454
attrs.append(key)
455-
values.append([to_ternary_int(v) for v in value])
455+
row = [to_ternary_int(v) for v in value]
456456
elif key == "MORPH":
457457
attrs.append(key)
458-
values.append([vocab.morphology.add(v) for v in value])
458+
row = [vocab.morphology.add(v) for v in value]
459459
else:
460460
attrs.append(key)
461461
if not all(isinstance(v, str) for v in value):
462462
types = set([type(v) for v in value])
463463
raise TypeError(Errors.E969.format(field=key, types=types)) from None
464-
values.append([vocab.strings.add(v) for v in value])
465-
array = numpy.asarray(values, dtype="uint64")
464+
row = [vocab.strings.add(v) for v in value]
465+
values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row])
466+
array = numpy.array(values, dtype=numpy.uint64)
466467
return attrs, array.T
467468

468469

0 commit comments

Comments
 (0)