前言
Python3 中有六个标准的数据类型:
- Number
- String
- List
- Tuple
- Set
- Dictionary
本章主要介绍 String 数据类型。
String 类型对象
str 在 Python 中也是一个类型对象,它在 Python/bltinmodule.c 文件中被注册为内建名称:
...
/* Excerpt: each SETBUILTIN line pairs a builtin name with a type object
   (the macro's definition is not shown here); "str" is paired with
   PyUnicode_Type. */
SETBUILTIN("staticmethod", &PyStaticMethod_Type);
SETBUILTIN("str", &PyUnicode_Type);
SETBUILTIN("super", &PySuper_Type);
...
也就是说,str 对应着 PyUnicode_Type,其声明在 Include/unicodeobject.h 中,定义在 Objects/unicodeobject.c 中:
/*
 * Type object whose tp_name is "str".
 *
 * The initializer is positional: every entry fills the next PyTypeObject
 * field in declaration order, so entries must not be reordered or dropped.
 * The trailing comment on each line names the slot being filled; a 0 entry
 * leaves that slot unset in this initializer.
 */
PyTypeObject PyUnicode_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
"str", /* tp_name */
sizeof(PyUnicodeObject), /* tp_basicsize */
0, /* tp_itemsize */
/* Slots */
(destructor)unicode_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_reserved */
unicode_repr, /* tp_repr */
&unicode_as_number, /* tp_as_number */
&unicode_as_sequence, /* tp_as_sequence */
&unicode_as_mapping, /* tp_as_mapping */
(hashfunc) unicode_hash, /* tp_hash */
0, /* tp_call */
(reprfunc) unicode_str, /* tp_str */
PyObject_GenericGetAttr, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
unicode_doc, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
PyUnicode_RichCompare, /* tp_richcompare */
0, /* tp_weaklistoffset */
unicode_iter, /* tp_iter */
0, /* tp_iternext */
unicode_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
&PyBaseObject_Type, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
0, /* tp_init */
0, /* tp_alloc */
unicode_new, /* tp_new: constructor entry point for this type */
PyObject_Del, /* tp_free */
};
字符串对象的创建
字符串对象的创建通过 unicode_new 函数实现,其定义在 Objects/unicodeobject.c 中:
/*
 * tp_new implementation for PyUnicode_Type.
 *
 * Accepts up to three optional arguments, by position or by keyword:
 * "object", "encoding" and "errors".
 *
 *   - type is not exactly PyUnicode_Type -> delegate to unicode_subtype_new()
 *   - argument parsing fails             -> NULL
 *   - no object supplied                 -> the empty-string return macro
 *   - object only                        -> PyObject_Str(object)
 *   - encoding and/or errors supplied    -> PyUnicode_FromEncodedObject()
 */
static PyObject *
unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    static char *kwlist[] = {"object", "encoding", "errors", 0};
    PyObject *source = NULL;
    char *codec_name = NULL;
    char *error_mode = NULL;

    /* Any other type is handed to the dedicated subtype constructor. */
    if (type != &PyUnicode_Type) {
        return unicode_subtype_new(type, args, kwds);
    }

    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", kwlist,
                                     &source, &codec_name, &error_mode)) {
        return NULL;
    }

    /* Called with no object at all. */
    if (source == NULL) {
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Supplying either decoding parameter selects the decode path. */
    if (codec_name != NULL || error_mode != NULL) {
        return PyUnicode_FromEncodedObject(source, codec_name, error_mode);
    }
    return PyObject_Str(source);
}
字符串的创建涉及到编码的问题:当指定了 encoding 或 errors 参数时,最终调用 PyUnicode_FromEncodedObject 函数:
/*
 * Decode `obj` into a str object using `encoding` and `errors`.
 *
 * `encoding` and `errors` are passed through to PyUnicode_Decode() unchanged.
 *
 * Returns NULL after raising an error when:
 *   - obj is NULL (PyErr_BadInternalCall),
 *   - obj passes PyUnicode_Check (TypeError),
 *   - obj does not provide a PyBUF_SIMPLE buffer (TypeError).
 * Zero-length input takes the empty-string return macro without calling
 * the decoder.
 */
PyObject *
PyUnicode_FromEncodedObject(PyObject *obj,
                            const char *encoding,
                            const char *errors)
{
    Py_buffer view;
    PyObject *decoded;

    if (obj == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* Fast path: bytes objects are decoded straight from their internal
       storage, without acquiring a buffer view. */
    if (PyBytes_Check(obj)) {
        if (PyBytes_GET_SIZE(obj) == 0) {
            _Py_RETURN_UNICODE_EMPTY();
        }
        return PyUnicode_Decode(PyBytes_AS_STRING(obj),
                                PyBytes_GET_SIZE(obj),
                                encoding, errors);
    }

    /* A str is rejected explicitly rather than decoded. */
    if (PyUnicode_Check(obj)) {
        PyErr_SetString(PyExc_TypeError,
                        "decoding str is not supported");
        return NULL;
    }

    /* Everything else must expose its bytes through the buffer interface. */
    if (PyObject_GetBuffer(obj, &view, PyBUF_SIMPLE) < 0) {
        PyErr_Format(PyExc_TypeError,
                     "decoding to str: need a bytes-like object, %.80s found",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    /* The view is released on both exits below. */
    if (view.len == 0) {
        PyBuffer_Release(&view);
        _Py_RETURN_UNICODE_EMPTY();
    }

    decoded = PyUnicode_Decode((char*) view.buf, view.len, encoding, errors);
    PyBuffer_Release(&view);
    return decoded;
}