python源码阅读-字符串

前言

Python3中有六个标准的数据类型:

  • Number
  • String
  • List
  • Tuple
  • Set
  • Dictionary

本章主要介绍String数据类型

String

类型对象

stringpython中,也是一个类型对象,其定义在Python/bltinmodule.c文件中:

1
2
3
4
5
...
SETBUILTIN("staticmethod", &PyStaticMethod_Type);
SETBUILTIN("str", &PyUnicode_Type);
SETBUILTIN("super", &PySuper_Type);
...

也就是对应着PyUnicode_Type,其定义在Include/unicodeobject.h:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
PyTypeObject PyUnicode_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
"str", /* tp_name */
sizeof(PyUnicodeObject), /* tp_size */
0, /* tp_itemsize */
/* Slots */
(destructor)unicode_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_reserved */
unicode_repr, /* tp_repr */
&unicode_as_number, /* tp_as_number */
&unicode_as_sequence, /* tp_as_sequence */
&unicode_as_mapping, /* tp_as_mapping */
(hashfunc) unicode_hash, /* tp_hash*/
0, /* tp_call*/
(reprfunc) unicode_str, /* tp_str */
PyObject_GenericGetAttr, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
unicode_doc, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
PyUnicode_RichCompare, /* tp_richcompare */
0, /* tp_weaklistoffset */
unicode_iter, /* tp_iter */
0, /* tp_iternext */
unicode_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
&PyBaseObject_Type, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
0, /* tp_init */
0, /* tp_alloc */
unicode_new, /* tp_new */
PyObject_Del, /* tp_free */
};

字符串对象的创建

字符串对象的创建通过unicode_new方法实现,通用定义在Include/unicodeobject.h:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
static PyObject *
unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
PyObject *x = NULL;
static char *kwlist[] = {"object", "encoding", "errors", 0};
char *encoding = NULL;
char *errors = NULL;

if (type != &PyUnicode_Type)
return unicode_subtype_new(type, args, kwds);
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
kwlist, &x, &encoding, &errors))
return NULL;
if (x == NULL)
_Py_RETURN_UNICODE_EMPTY();
if (encoding == NULL && errors == NULL)
return PyObject_Str(x);
else
return PyUnicode_FromEncodedObject(x, encoding, errors);
}

字符的创建涉及到编码的问题,最终调用了PyUnicode_FromEncodedObject函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
PyObject *
PyUnicode_FromEncodedObject(PyObject *obj,
const char *encoding,
const char *errors)
{
Py_buffer buffer;
PyObject *v;

if (obj == NULL) {
PyErr_BadInternalCall();
return NULL;
}

/* Decoding bytes objects is the most common case and should be fast */
if (PyBytes_Check(obj)) {
if (PyBytes_GET_SIZE(obj) == 0)
_Py_RETURN_UNICODE_EMPTY();
v = PyUnicode_Decode(
PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
encoding, errors);
return v;
}

if (PyUnicode_Check(obj)) {
PyErr_SetString(PyExc_TypeError,
"decoding str is not supported");
return NULL;
}

/* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
PyErr_Format(PyExc_TypeError,
"decoding to str: need a bytes-like object, %.80s found",
Py_TYPE(obj)->tp_name);
return NULL;
}

if (buffer.len == 0) {
PyBuffer_Release(&buffer);
_Py_RETURN_UNICODE_EMPTY();
}

v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
PyBuffer_Release(&buffer);
return v;
}