Number
Python3中有六个标准的数据类型:
- Number(数字)
- String(字符串)
- List(列表)
- Tuple(元组)
- Set(集合)
- Dictionary(字典)
Python3支持int
、float
、complex
(复数)3种Number
数据类型,其中bool
(Booleans)型为int
型的子类。
本章节介绍其中的int
数据类型。
int
类型对象
int
在python
中,也是一个对象,也就是类型对象。其定义在Python/bltinmodule.c
文件中:
...
SETBUILTIN("property", &PyProperty_Type);
SETBUILTIN("int", &PyLong_Type);
SETBUILTIN("list", &PyList_Type);
...
也就是对应着PyLong_Type
,其定义在Objects/longobject.c
:
PyTypeObject PyLong_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
"int", /* tp_name */
offsetof(PyLongObject, ob_digit), /* tp_basicsize */
sizeof(digit), /* tp_itemsize */
long_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_reserved */
long_to_decimal_string, /* tp_repr */
&long_as_number, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
(hashfunc)long_hash, /* tp_hash */
0, /* tp_call */
long_to_decimal_string, /* tp_str */
PyObject_GenericGetAttr, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Py_TPFLAGS_LONG_SUBCLASS, /* tp_flags */
long_doc, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
long_richcompare, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
long_methods, /* tp_methods */
0, /* tp_members */
long_getset, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
0, /* tp_init */
0, /* tp_alloc */
long_new, /* tp_new */
PyObject_Del, /* tp_free */
};
可以看出int
同样是PyTypeObject
的一个实例,来作为类型对象。
整数对象的创建
对象的创建时通过调用类型对象中的tp_new
方法实现的,从上面int
的实现可以看到int
对象的创建是通过long_new
方法实现的。
long_new
方法定义在Objects/clinic/longobject.c.h
中:
static PyObject *
long_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
static PyObject *
long_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
PyObject *obase = NULL, *x = NULL;
Py_ssize_t base;
static char *kwlist[] = {"x", "base", 0};
if (type != &PyLong_Type)
return long_subtype_new(type, args, kwds); /* Wimp out */
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|OO:int", kwlist,
&x, &obase))
return NULL;
if (x == NULL) {
if (obase != NULL) {
PyErr_SetString(PyExc_TypeError,
"int() missing string argument");
return NULL;
}
return PyLong_FromLong(0L);
}
if (obase == NULL)
return PyNumber_Long(x);
base = PyNumber_AsSsize_t(obase, NULL);
if (base == -1 && PyErr_Occurred())
return NULL;
if ((base != 0 && base < 2) || base > 36) {
PyErr_SetString(PyExc_ValueError,
"int() base must be >= 2 and <= 36, or 0");
return NULL;
}
if (PyUnicode_Check(x))
return PyLong_FromUnicodeObject(x, (int)base);
else if (PyByteArray_Check(x) || PyBytes_Check(x)) {
char *string;
if (PyByteArray_Check(x))
string = PyByteArray_AS_STRING(x);
else
string = PyBytes_AS_STRING(x);
return _PyLong_FromBytes(string, Py_SIZE(x), (int)base);
}
else {
PyErr_SetString(PyExc_TypeError,
"int() can't convert non-string with explicit base");
return NULL;
}
}
对照着Python3
中int
类型的Docstring
来看:
PyDoc_STRVAR(long_doc,
"int(x=0) -> integer\n\
int(x, base=10) -> integer\n\
\n\
Convert a number or string to an integer, or return 0 if no arguments\n\
are given. If x is a number, return x.__int__(). For floating point\n\
numbers, this truncates towards zero.\n\
\n\
If x is not a number or if base is given, then x must be a string,\n\
bytes, or bytearray instance representing an integer literal in the\n\
given base. The literal can be preceded by '+' or '-' and be surrounded\n\
by whitespace. The base defaults to 10. Valid bases are 0 and 2-36.\n\
Base 0 means to interpret the base from the string as an integer literal.\n\
>>> int('0b100', base=0)\n\
4");
可以看出,int
的参数x
和base
对应着long_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
中的args
和kwds
。
函数的内部逻辑为:
- 首先进行类型判断;
- 参数解析,尝试看了下
PyArg_ParseTupleAndKeywords
函数的内部逻辑,然后放弃了2333,大概作用是将args
和kwds
参数做类型判断并解析赋值到x
和base
中; - 然后进行参数判断,当
x
为NULL
时: 3.1.base
不为空,则返回错误"int() missing string argument"
3.2.``` >>> a = int(base=10) Traceback (most recent call last): File "<stdin>", line 1, in <module> TypeError: int() missing string argument ```
base
同时为空,则返回0``` >>> a = int() >>> a 0 ```
然后继续进行参数判断,当
x
不为NULL
时: 4.1.base
为空,返回PyNumber_Long(x)
``` >>> print(int(10)) 10 >>> print(int('10')) 10 ```
4.2.
base
不为NULL
,那么对x
和base
进行一系列检查判断``` >>> print(int(1, base=37)) Traceback (most recent call last): File "<stdin>", line 1, in <module> ValueError: int() base must be >= 2 and <= 36, or 0 #为什么是36呢?因为`0-9`加上`A-Z`刚好36呀 >>> print(int(10, base=36)) Traceback (most recent call last): File "<stdin>", line 1, in <module> TypeError: int() can't convert non-string with explicit base #指定base时,则x不能为非字符串 >>> print(int('1Z', base=36)) 71 ```
4.3. 当
x
和base
都满足条件时:* `PyUnicode` -> `PyLong_FromUnicodeObject` -> `PyLong_FromString` * `PyByteArray/PyBytes` -> `_PyLong_FromBytes` -> `PyLong_FromString` `PyLong_FromUnicodeObject`和`_PyLong_FromBytes`都是将`PyUnicode`对象和`PyByteArrry/PyBytes`对象转化为`string`(c中的`char *`),然后再调用`PyLong_FromString`
可以看到最终整数对象的创建时通过PyLong_FromLong
、 PyNumber_Long
和PyLong_FromString
几个函数来完成的。
PyLong_FromLong
PyObject *
PyLong_FromLong(long ival)
{
PyLongObject *v;
unsigned long abs_ival;
unsigned long t; /* unsigned so >> doesn't propagate sign bit */
int ndigits = 0;
int sign;
CHECK_SMALL_INT(ival);
if (ival < 0) {
/* negate: can't write this as abs_ival = -ival since that
invokes undefined behaviour when ival is LONG_MIN */
abs_ival = 0U-(unsigned long)ival;
sign = -1;
}
else {
abs_ival = (unsigned long)ival;
sign = ival == 0 ? 0 : 1;
}
/* Fast path for single-digit ints */
if (!(abs_ival >> PyLong_SHIFT)) {
v = _PyLong_New(1);
if (v) {
Py_SIZE(v) = sign;
v->ob_digit[0] = Py_SAFE_DOWNCAST(
abs_ival, unsigned long, digit);
}
return (PyObject*)v;
}
#if PyLong_SHIFT==15
/* 2 digits */
if (!(abs_ival >> 2*PyLong_SHIFT)) {
v = _PyLong_New(2);
if (v) {
Py_SIZE(v) = 2*sign;
v->ob_digit[0] = Py_SAFE_DOWNCAST(
abs_ival & PyLong_MASK, unsigned long, digit);
v->ob_digit[1] = Py_SAFE_DOWNCAST(
abs_ival >> PyLong_SHIFT, unsigned long, digit);
}
return (PyObject*)v;
}
#endif
/* Larger numbers: loop to determine number of digits */
t = abs_ival;
while (t) {
++ndigits;
t >>= PyLong_SHIFT;
}
v = _PyLong_New(ndigits);
if (v != NULL) {
digit *p = v->ob_digit;
Py_SIZE(v) = ndigits*sign;
t = abs_ival;
while (t) {
*p++ = Py_SAFE_DOWNCAST(
t & PyLong_MASK, unsigned long, digit);
t >>= PyLong_SHIFT;
}
}
return (PyObject *)v;
}
首先看看前面的CHECK_SMALL_INT(ival);
static PyObject *
get_small_int(sdigit ival)
{
PyObject *v;
assert(-NSMALLNEGINTS <= ival && ival < NSMALLPOSINTS);
v = (PyObject *)&small_ints[ival + NSMALLNEGINTS];
Py_INCREF(v);
#ifdef COUNT_ALLOCS
if (ival >= 0)
quick_int_allocs++;
else
quick_neg_int_allocs++;
#endif
return v;
}
#define CHECK_SMALL_INT(ival) \
do if (-NSMALLNEGINTS <= ival && ival < NSMALLPOSINTS) { \
return get_small_int((sdigit)ival); \
} while(0)
这里涉及到两个宏:NSMALLNEGINTS
和NSMALLPOSINTS
#ifndef NSMALLPOSINTS
#define NSMALLPOSINTS 257
#endif
#ifndef NSMALLNEGINTS
#define NSMALLNEGINTS 5
#endif
也就是说当-5 <= ival < 257
时,直接返回small_ints[ival + NSMALLNEGINTS]
,也就是当ival
等于-5的时候,返回small_ints[0]
.
我们再来看一个问题:
>>> a = -6
>>> b = -6
>>> a is b
False
>>> a = -5
>>> b = -5
>>> a is b
True
>>> a = 256
>>> b = 256
>>> a is b
True
>>> a = 257
>>> b = 257
>>> a is b
False
可以看出,这种差异的产生肯定和small_ints
有着某种联系。
Python
在创建整数对象时,分为小整数和大整数区别对待。
小整数
#if NSMALLNEGINTS + NSMALLPOSINTS > 0
/* Small integers are preallocated in this array so that they
can be shared.
The integers that are preallocated are those in the range
-NSMALLNEGINTS (inclusive) to NSMALLPOSINTS (not inclusive).
*/
static PyLongObject small_ints[NSMALLNEGINTS + NSMALLPOSINTS];
当Python
初始化调用_PyLong_Init
函数时完成对small_ints
的初始化。
int
_PyLong_Init(void)
{
#if NSMALLNEGINTS + NSMALLPOSINTS > 0
int ival, size;
PyLongObject *v = small_ints;
for (ival = -NSMALLNEGINTS; ival < NSMALLPOSINTS; ival++, v++) {
size = (ival < 0) ? -1 : ((ival == 0) ? 0 : 1);
if (Py_TYPE(v) == &PyLong_Type) {
/* The element is already initialized, most likely
* the Python interpreter was initialized before.
*/
Py_ssize_t refcnt;
PyObject* op = (PyObject*)v;
refcnt = Py_REFCNT(op) < 0 ? 0 : Py_REFCNT(op);
_Py_NewReference(op);
/* _Py_NewReference sets the ref count to 1 but
* the ref count might be larger. Set the refcnt
* to the original refcnt + 1 */
Py_REFCNT(op) = refcnt + 1;
assert(Py_SIZE(op) == size);
assert(v->ob_digit[0] == (digit)abs(ival));
}
else {
(void)PyObject_INIT(v, &PyLong_Type);
}
Py_SIZE(v) = size;
v->ob_digit[0] = (digit)abs(ival);
}
#endif
/* initialize int_info */
if (Int_InfoType.tp_name == NULL) {
if (PyStructSequence_InitType2(&Int_InfoType, &int_info_desc) < 0)
return 0;
}
return 1;
}
在for
循环中我们可以看到,python将存储[-5, 257)之间的小整数对象分配到small_ints
指针数组中,数组中的每一个PyLongObject
对象都能被任意地共享。这也就能解释了上面问题中出现的差异。
那么,整这么个小整数数组处理的目的是啥呢?
小整数一般是Python
程序中高频使用的整数对象,使用小整数对象池可以避免在一些简单的循环和计算中频繁的调用malloc
分配内存和free
释放内存。这样频繁的内存操作不仅降低运行效率吗,而且会在系统堆上造成大量内存碎片,严重影响Python
整体性能。
大整数对象
Python2
中的整数对象其实有两种:PyIntObject
和PyLongObject
,在Python2中的大整数其实指的是PyIntObject
范围内的不在[-NSMALLNEGINTS, NSMALLPOSINTS)
这个区间的值,而Python3
中只有PyLongObject
一种整数对象,所以大整数指的就是除小整数以外的值。
Python2
中的PyIntObject
定义:
typedef struct {
PyObject_HEAD
long ob_ival;
} PyIntObject
Python2
中的PyIntObject
实际是对c
中的long
的包装。所以Python2
也提供了专门的缓存池,供大整数轮流使用,避免每次使用不断的malloc
分配内存带来的效率损耗。
Python3
中的PyLongObject
就没有使用缓存池,直接malloc/free
操作内存,这也是Python3
的性能损耗的一个方面。
下面就是PyLong_FromLong调用申请内存空间的函数_PyLong_New:
/* Allocate a new int object with size digits.
Return NULL and set exception if we run out of memory. */
#define MAX_LONG_DIGITS \
((PY_SSIZE_T_MAX - offsetof(PyLongObject, ob_digit))/sizeof(digit))
PyLongObject *
_PyLong_New(Py_ssize_t size)
{
PyLongObject *result;
/* Number of bytes needed is: offsetof(PyLongObject, ob_digit) +
sizeof(digit)*size. Previous incarnations of this code used
sizeof(PyVarObject) instead of the offsetof, but this risks being
incorrect in the presence of padding between the PyVarObject header
and the digits. */
if (size > (Py_ssize_t)MAX_LONG_DIGITS) {
PyErr_SetString(PyExc_OverflowError,
"too many digits in integer");
return NULL;
}
result = PyObject_MALLOC(offsetof(PyLongObject, ob_digit) +
size*sizeof(digit));
if (!result) {
PyErr_NoMemory();
return NULL;
}
return (PyLongObject*)PyObject_INIT_VAR(result, &PyLong_Type, size);
}
可以看出来申请的内存大小是ob_digit
相对结构体PyLongObject
的偏移大小加上size * digit
类型的大小。
PyNumber_Long
PyNumber_Long
定义在Objects/abstract.c
:
PyObject *
PyNumber_Long(PyObject *o)
{
PyObject *result;
PyNumberMethods *m;
PyObject *trunc_func;
Py_buffer view;
_Py_IDENTIFIER(__trunc__);
if (o == NULL) {
return null_error();
}
if (PyLong_CheckExact(o)) {
Py_INCREF(o);
return o;
}
m = o->ob_type->tp_as_number;
if (m && m->nb_int) { /* This should include subclasses of int */
result = (PyObject *)_PyLong_FromNbInt(o);
if (result != NULL && !PyLong_CheckExact(result)) {
Py_SETREF(result, _PyLong_Copy((PyLongObject *)result));
}
return result;
}
trunc_func = _PyObject_LookupSpecial(o, &PyId___trunc__);
if (trunc_func) {
result = _PyObject_CallNoArg(trunc_func);
Py_DECREF(trunc_func);
if (result == NULL || PyLong_CheckExact(result)) {
return result;
}
if (PyLong_Check(result)) {
Py_SETREF(result, _PyLong_Copy((PyLongObject *)result));
return result;
}
/* __trunc__ is specified to return an Integral type,
but int() needs to return an int. */
m = result->ob_type->tp_as_number;
if (m == NULL || m->nb_int == NULL) {
PyErr_Format(
PyExc_TypeError,
"__trunc__ returned non-Integral (type %.200s)",
result->ob_type->tp_name);
Py_DECREF(result);
return NULL;
}
Py_SETREF(result, (PyObject *)_PyLong_FromNbInt(result));
if (result != NULL && !PyLong_CheckExact(result)) {
Py_SETREF(result, _PyLong_Copy((PyLongObject *)result));
}
return result;
}
if (PyErr_Occurred())
return NULL;
if (PyUnicode_Check(o))
/* The below check is done in PyLong_FromUnicode(). */
return PyLong_FromUnicodeObject(o, 10);
if (PyBytes_Check(o))
/* need to do extra error checking that PyLong_FromString()
* doesn't do. In particular int('9\x005') must raise an
* exception, not truncate at the null.
*/
return _PyLong_FromBytes(PyBytes_AS_STRING(o),
PyBytes_GET_SIZE(o), 10);
if (PyByteArray_Check(o))
return _PyLong_FromBytes(PyByteArray_AS_STRING(o),
PyByteArray_GET_SIZE(o), 10);
if (PyObject_GetBuffer(o, &view, PyBUF_SIMPLE) == 0) {
PyObject *bytes;
/* Copy to NUL-terminated buffer. */
bytes = PyBytes_FromStringAndSize((const char *)view.buf, view.len);
if (bytes == NULL) {
PyBuffer_Release(&view);
return NULL;
}
result = _PyLong_FromBytes(PyBytes_AS_STRING(bytes),
PyBytes_GET_SIZE(bytes), 10);
Py_DECREF(bytes);
PyBuffer_Release(&view);
return result;
}
return type_error("int() argument must be a string, a bytes-like object "
"or a number, not '%.200s'", o);
}
PyNumber_Long
处理了obase
为NULL
的情况,对于能支持数值对象操作(o->ob_type->tp_as_number->nb_int
)的,并且会借助_PyLong_Copy
和Py_SETREF
实现了把数值的PyObject *
对象转成PyLongObject *
对象。
// Include/object.h
#define Py_SETREF(op, op2) \
do { \
PyObject *_py_tmp = (PyObject *)(op); \
(op) = (op2); \
Py_DECREF(_py_tmp); \
} while (0)
// Object/longobject.c
PyObject *
_PyLong_Copy(PyLongObject *src)
{
PyLongObject *result;
Py_ssize_t i;
assert(src != NULL);
i = Py_SIZE(src);
if (i < 0)
i = -(i);
if (i < 2) {
sdigit ival = MEDIUM_VALUE(src);
CHECK_SMALL_INT(ival);
}
result = _PyLong_New(i);
if (result != NULL) {
Py_SIZE(result) = Py_SIZE(src);
while (--i >= 0)
result->ob_digit[i] = src->ob_digit[i];
}
return (PyObject *)result;
}
在Python对象的类型中,有三组重要的操作族:
tp_as_number
、tp_as_sequence
、tp_as_mapping
,它们分别指向PyNumberMethods
、PySequenceMethods
和PyMappingMethods
函数族。如果一个对象能被视为数值对象,那么在对应的类型中,tp_as_number
指向对该对象的数值操作。同样,PySequenceMethods
和PyMappingMethods
分别定义了作为一个序列对象和关联对象应该支持的行为,这两种对象的典型例子是list
和dict
。 摘自:《Python源码剖析》 — 陈儒
不能支持数值对象操作的其他对象分别处理,但是最终都是由PyLong_FromString
来处理。
PyLong_FromString
/* Parses an int from a bytestring. Leading and trailing whitespace will be
* ignored.
*
* If successful, a PyLong object will be returned and 'pend' will be pointing
* to the first unused byte unless it's NULL.
*
* If unsuccessful, NULL will be returned.
*/
PyObject *
PyLong_FromString(const char *str, char **pend, int base)
{
int sign = 1, error_if_nonzero = 0;
const char *start, *orig_str = str;
PyLongObject *z = NULL;
PyObject *strobj;
Py_ssize_t slen;
if ((base != 0 && base < 2) || base > 36) {
PyErr_SetString(PyExc_ValueError,
"int() arg 2 must be >= 2 and <= 36");
return NULL;
}
while (*str != '\0' && Py_ISSPACE(Py_CHARMASK(*str))) {
str++;
}
if (*str == '+') {
++str;
}
else if (*str == '-') {
++str;
sign = -1;
}
if (base == 0) {
if (str[0] != '0') {
base = 10;
}
else if (str[1] == 'x' || str[1] == 'X') {
base = 16;
}
else if (str[1] == 'o' || str[1] == 'O') {
base = 8;
}
else if (str[1] == 'b' || str[1] == 'B') {
base = 2;
}
else {
/* "old" (C-style) octal literal, now invalid.
it might still be zero though */
error_if_nonzero = 1;
base = 10;
}
}
if (str[0] == '0' &&
((base == 16 && (str[1] == 'x' || str[1] == 'X')) ||
(base == 8 && (str[1] == 'o' || str[1] == 'O')) ||
(base == 2 && (str[1] == 'b' || str[1] == 'B')))) {
str += 2;
/* One underscore allowed here. */
if (*str == '_') {
++str;
}
}
if (str[0] == '_') {
/* May not start with underscores. */
goto onError;
}
start = str;
if ((base & (base - 1)) == 0) {
int res = long_from_binary_base(&str, base, &z);
if (res < 0) {
/* Syntax error. */
goto onError;
}
}
else {
/***
Binary bases can be converted in time linear in the number of digits, because
Python's representation base is binary. Other bases (including decimal!) use
the simple quadratic-time algorithm below, complicated by some speed tricks.
First some math: the largest integer that can be expressed in N base-B digits
is B**N-1. Consequently, if we have an N-digit input in base B, the worst-
case number of Python digits needed to hold it is the smallest integer n s.t.
BASE**n-1 >= B**N-1 [or, adding 1 to both sides]
BASE**n >= B**N [taking logs to base BASE]
n >= log(B**N)/log(BASE) = N * log(B)/log(BASE)
The static array log_base_BASE[base] == log(base)/log(BASE) so we can compute
this quickly. A Python int with that much space is reserved near the start,
and the result is computed into it.
The input string is actually treated as being in base base**i (i.e., i digits
are processed at a time), where two more static arrays hold:
convwidth_base[base] = the largest integer i such that base**i <= BASE
convmultmax_base[base] = base ** convwidth_base[base]
The first of these is the largest i such that i consecutive input digits
must fit in a single Python digit. The second is effectively the input
base we're really using.
Viewing the input as a sequence <c0, c1, ..., c_n-1> of digits in base
convmultmax_base[base], the result is "simply"
(((c0*B + c1)*B + c2)*B + c3)*B + ... ))) + c_n-1
where B = convmultmax_base[base].
Error analysis: as above, the number of Python digits `n` needed is worst-
case
n >= N * log(B)/log(BASE)
where `N` is the number of input digits in base `B`. This is computed via
size_z = (Py_ssize_t)((scan - str) * log_base_BASE[base]) + 1;
below. Two numeric concerns are how much space this can waste, and whether
the computed result can be too small. To be concrete, assume BASE = 2**15,
which is the default (and it's unlikely anyone changes that).
Waste isn't a problem: provided the first input digit isn't 0, the difference
between the worst-case input with N digits and the smallest input with N
digits is about a factor of B, but B is small compared to BASE so at most
one allocated Python digit can remain unused on that count. If
N*log(B)/log(BASE) is mathematically an exact integer, then truncating that
and adding 1 returns a result 1 larger than necessary. However, that can't
happen: whenever B is a power of 2, long_from_binary_base() is called
instead, and it's impossible for B**i to be an integer power of 2**15 when
B is not a power of 2 (i.e., it's impossible for N*log(B)/log(BASE) to be
an exact integer when B is not a power of 2, since B**i has a prime factor
other than 2 in that case, but (2**15)**j's only prime factor is 2).
The computed result can be too small if the true value of N*log(B)/log(BASE)
is a little bit larger than an exact integer, but due to roundoff errors (in
computing log(B), log(BASE), their quotient, and/or multiplying that by N)
yields a numeric result a little less than that integer. Unfortunately, "how
close can a transcendental function get to an integer over some range?"
questions are generally theoretically intractable. Computer analysis via
continued fractions is practical: expand log(B)/log(BASE) via continued
fractions, giving a sequence i/j of "the best" rational approximations. Then
j*log(B)/log(BASE) is approximately equal to (the integer) i. This shows that
we can get very close to being in trouble, but very rarely. For example,
76573 is a denominator in one of the continued-fraction approximations to
log(10)/log(2**15), and indeed:
>>> log(10)/log(2**15)*76573
16958.000000654003
is very close to an integer. If we were working with IEEE single-precision,
rounding errors could kill us. Finding worst cases in IEEE double-precision
requires better-than-double-precision log() functions, and Tim didn't bother.
Instead the code checks to see whether the allocated space is enough as each
new Python digit is added, and copies the whole thing to a larger int if not.
This should happen extremely rarely, and in fact I don't have a test case
that triggers it(!). Instead the code was tested by artificially allocating
just 1 digit at the start, so that the copying code was exercised for every
digit beyond the first.
***/
twodigits c; /* current input character */
Py_ssize_t size_z;
int digits = 0;
int i;
int convwidth;
twodigits convmultmax, convmult;
digit *pz, *pzstop;
const char *scan, *lastdigit;
char prev = 0;
static double log_base_BASE[37] = {0.0e0,};
static int convwidth_base[37] = {0,};
static twodigits convmultmax_base[37] = {0,};
if (log_base_BASE[base] == 0.0) {
twodigits convmax = base;
int i = 1;
log_base_BASE[base] = (log((double)base) /
log((double)PyLong_BASE));
for (;;) {
twodigits next = convmax * base;
if (next > PyLong_BASE) {
break;
}
convmax = next;
++i;
}
convmultmax_base[base] = convmax;
assert(i > 0);
convwidth_base[base] = i;
}
/* Find length of the string of numeric characters. */
scan = str;
lastdigit = str;
while (_PyLong_DigitValue[Py_CHARMASK(*scan)] < base || *scan == '_') {
if (*scan == '_') {
if (prev == '_') {
/* Only one underscore allowed. */
str = lastdigit + 1;
goto onError;
}
}
else {
++digits;
lastdigit = scan;
}
prev = *scan;
++scan;
}
if (prev == '_') {
/* Trailing underscore not allowed. */
/* Set error pointer to first underscore. */
str = lastdigit + 1;
goto onError;
}
/* Create an int object that can contain the largest possible
* integer with this base and length. Note that there's no
* need to initialize z->ob_digit -- no slot is read up before
* being stored into.
*/
size_z = (Py_ssize_t)(digits * log_base_BASE[base]) + 1;
/* Uncomment next line to test exceedingly rare copy code */
/* size_z = 1; */
assert(size_z > 0);
z = _PyLong_New(size_z);
if (z == NULL) {
return NULL;
}
Py_SIZE(z) = 0;
/* `convwidth` consecutive input digits are treated as a single
* digit in base `convmultmax`.
*/
convwidth = convwidth_base[base];
convmultmax = convmultmax_base[base];
/* Work ;-) */
while (str < scan) {
if (*str == '_') {
str++;
continue;
}
/* grab up to convwidth digits from the input string */
c = (digit)_PyLong_DigitValue[Py_CHARMASK(*str++)];
for (i = 1; i < convwidth && str != scan; ++str) {
if (*str == '_') {
continue;
}
i++;
c = (twodigits)(c * base +
(int)_PyLong_DigitValue[Py_CHARMASK(*str)]);
assert(c < PyLong_BASE);
}
convmult = convmultmax;
/* Calculate the shift only if we couldn't get
* convwidth digits.
*/
if (i != convwidth) {
convmult = base;
for ( ; i > 1; --i) {
convmult *= base;
}
}
/* Multiply z by convmult, and add c. */
pz = z->ob_digit;
pzstop = pz + Py_SIZE(z);
for (; pz < pzstop; ++pz) {
c += (twodigits)*pz * convmult;
*pz = (digit)(c & PyLong_MASK);
c >>= PyLong_SHIFT;
}
/* carry off the current end? */
if (c) {
assert(c < PyLong_BASE);
if (Py_SIZE(z) < size_z) {
*pz = (digit)c;
++Py_SIZE(z);
}
else {
PyLongObject *tmp;
/* Extremely rare. Get more space. */
assert(Py_SIZE(z) == size_z);
tmp = _PyLong_New(size_z + 1);
if (tmp == NULL) {
Py_DECREF(z);
return NULL;
}
memcpy(tmp->ob_digit,
z->ob_digit,
sizeof(digit) * size_z);
Py_DECREF(z);
z = tmp;
z->ob_digit[size_z] = (digit)c;
++size_z;
}
}
}
}
if (z == NULL) {
return NULL;
}
if (error_if_nonzero) {
/* reset the base to 0, else the exception message
doesn't make too much sense */
base = 0;
if (Py_SIZE(z) != 0) {
goto onError;
}
/* there might still be other problems, therefore base
remains zero here for the same reason */
}
if (str == start) {
goto onError;
}
if (sign < 0) {
Py_SIZE(z) = -(Py_SIZE(z));
}
while (*str && Py_ISSPACE(Py_CHARMASK(*str))) {
str++;
}
if (*str != '\0') {
goto onError;
}
long_normalize(z);
z = maybe_small_long(z);
if (z == NULL) {
return NULL;
}
if (pend != NULL) {
*pend = (char *)str;
}
return (PyObject *) z;
onError:
if (pend != NULL) {
*pend = (char *)str;
}
Py_XDECREF(z);
slen = strlen(orig_str) < 200 ? strlen(orig_str) : 200;
strobj = PyUnicode_FromStringAndSize(orig_str, slen);
if (strobj == NULL) {
return NULL;
}
PyErr_Format(PyExc_ValueError,
"invalid literal for int() with base %d: %.200R",
base, strobj);
Py_DECREF(strobj);
return NULL;
}
整数对象的数值操作
前面提到的数值对象的类型中tp_as_number
指向对该对象的数值操作,在Objects/longobject.c
的PyLong_Type
中定义着:
PyTypeObject PyLong_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
"int", /* tp_name */
...
&long_as_number, /* tp_as_number */
...
};
long_as_number
也在Objects/longobject.c
中定义着:
static PyNumberMethods long_as_number = {
(binaryfunc)long_add, /*nb_add*/
(binaryfunc)long_sub, /*nb_subtract*/
(binaryfunc)long_mul, /*nb_multiply*/
long_mod, /*nb_remainder*/
long_divmod, /*nb_divmod*/
long_pow, /*nb_power*/
(unaryfunc)long_neg, /*nb_negative*/
(unaryfunc)long_long, /*tp_positive*/
(unaryfunc)long_abs, /*tp_absolute*/
(inquiry)long_bool, /*tp_bool*/
(unaryfunc)long_invert, /*nb_invert*/
long_lshift, /*nb_lshift*/
(binaryfunc)long_rshift, /*nb_rshift*/
long_and, /*nb_and*/
long_xor, /*nb_xor*/
long_or, /*nb_or*/
long_long, /*nb_int*/
0, /*nb_reserved*/
long_float, /*nb_float*/
0, /* nb_inplace_add */
0, /* nb_inplace_subtract */
0, /* nb_inplace_multiply */
0, /* nb_inplace_remainder */
0, /* nb_inplace_power */
0, /* nb_inplace_lshift */
0, /* nb_inplace_rshift */
0, /* nb_inplace_and */
0, /* nb_inplace_xor */
0, /* nb_inplace_or */
long_div, /* nb_floor_divide */
long_true_divide, /* nb_true_divide */
0, /* nb_inplace_floor_divide */
0, /* nb_inplace_true_divide */
long_long, /* nb_index */
};
整数加法
static PyObject *
long_add(PyLongObject *a, PyLongObject *b)
{
PyLongObject *z;
CHECK_BINOP(a, b);
if (Py_ABS(Py_SIZE(a)) <= 1 && Py_ABS(Py_SIZE(b)) <= 1) {
return PyLong_FromLong(MEDIUM_VALUE(a) + MEDIUM_VALUE(b));
}
if (Py_SIZE(a) < 0) {
if (Py_SIZE(b) < 0) {
z = x_add(a, b);
if (z != NULL) {
/* x_add received at least one multiple-digit int,
and thus z must be a multiple-digit int.
That also means z is not an element of
small_ints, so negating it in-place is safe. */
assert(Py_REFCNT(z) == 1);
Py_SIZE(z) = -(Py_SIZE(z));
}
}
else
z = x_sub(b, a);
}
else {
if (Py_SIZE(b) < 0)
z = x_sub(a, b);
else
z = x_add(a, b);
}
return (PyObject *)z;
}
对于都小于一个digit
大小的两数相加,直接进行简单加法计算。整数过大,就分别由x_add
和x_sub
两个函数来计算
/* Add the absolute values of two integers. */
static PyLongObject *
x_add(PyLongObject *a, PyLongObject *b)
{
Py_ssize_t size_a = Py_ABS(Py_SIZE(a)), size_b = Py_ABS(Py_SIZE(b));
PyLongObject *z;
Py_ssize_t i;
digit carry = 0;
/* Ensure a is the larger of the two: */
if (size_a < size_b) {
{ PyLongObject *temp = a; a = b; b = temp; }
{ Py_ssize_t size_temp = size_a;
size_a = size_b;
size_b = size_temp; }
}
z = _PyLong_New(size_a+1);
if (z == NULL)
return NULL;
for (i = 0; i < size_b; ++i) {
carry += a->ob_digit[i] + b->ob_digit[i];
z->ob_digit[i] = carry & PyLong_MASK;
carry >>= PyLong_SHIFT;
}
for (; i < size_a; ++i) {
carry += a->ob_digit[i];
z->ob_digit[i] = carry & PyLong_MASK;
carry >>= PyLong_SHIFT;
}
z->ob_digit[i] = carry;
return long_normalize(z);
}
x_add
进行的PyLongObject
对象的加法运算也是比较简单的。先申请size_a+1
(保证a > b
)大小的内存空间(防止进位溢出),从ob_digit
的低位开始按位相加,carry
做进位处理,另外保证ob_digit[i]
中的数值在[0, PyLong_MASK]
之间,然后处理a
对象的高位数字,最后利用long_normalize
函数调整ob_size
大小,保证了ob_digit[abs(ob_size)-1]
不为零。
/* Normalize (remove leading zeros from) an int object.
Doesn't attempt to free the storage--in most cases, due to the nature
of the algorithms used, this could save at most be one word anyway. */
static PyLongObject *
long_normalize(PyLongObject *v)
{
Py_ssize_t j = Py_ABS(Py_SIZE(v));
Py_ssize_t i = j;
while (i > 0 && v->ob_digit[i-1] == 0)
--i;
if (i != j)
Py_SIZE(v) = (Py_SIZE(v) < 0) ? -(i) : i;
return v;
}
整数乘法
static PyObject *
long_mul(PyLongObject *a, PyLongObject *b)
{
PyLongObject *z;
CHECK_BINOP(a, b);
/* fast path for single-digit multiplication */
if (Py_ABS(Py_SIZE(a)) <= 1 && Py_ABS(Py_SIZE(b)) <= 1) {
stwodigits v = (stwodigits)(MEDIUM_VALUE(a)) * MEDIUM_VALUE(b);
return PyLong_FromLongLong((long long)v);
}
z = k_mul(a, b);
/* Negate if exactly one of the inputs is negative. */
if (((Py_SIZE(a) ^ Py_SIZE(b)) < 0) && z) {
_PyLong_Negate(&z);
if (z == NULL)
return NULL;
}
return (PyObject *)z;
}
对于都小于一个digit
大小的两数乘积,直接进行简单相乘,而后用PyLong_FromLongLong((long long)v)
进行调整。整数过大则使用k_mul
函数计算,k_mul
函数是Karatsuba multiplication
算法的实现,一种快速相乘算法。
转载地址:Python中的整数对象