bilibili视频讲解:https://space.bilibili.com/431392724
b站用户名:平凡的久月
1. PyBytesObject
变长对象(数据长度在定义时是不知道的,只能在创建时才能确定)
不可变对象(对象创建后内容不能原地修改;所谓“修改”实际上是创建了一个新对象,因此内存地址会发生改变)
1.1 定义
// Include/bytesobject.h
/*
 * In-memory layout of a bytes object. The definition is only visible
 * when Py_LIMITED_API is not defined.
 */
#ifndef Py_LIMITED_API
typedef struct {
/* Variable-size object header, supplied by the PyObject_VAR_HEAD macro. */
PyObject_VAR_HEAD
/* Cached hash of the contents; -1 until it has been computed. */
Py_hash_t ob_shash;
/* Start of the inline byte data. Declared with one element, but the
 * invariants below state that room for ob_size+1 elements is provided. */
char ob_sval[1];
/* Invariants:
* ob_sval contains space for 'ob_size+1' elements.
* ob_sval[ob_size] == 0.
* ob_shash is the hash of the string or -1 if not computed yet.
*/
} PyBytesObject;
#endif
// Python 3 no longer uses PyBytesObject as the underlying implementation of the str type.
/* Header macro for variable-size objects: it declares one PyVarObject
 * member named ob_base (the trailing semicolon is part of the macro). */
#define PyObject_VAR_HEAD PyVarObject ob_base;
/* Header of every variable-size object: the fixed PyObject header
 * followed by an item count. */
typedef struct {
PyObject ob_base;
Py_ssize_t ob_size; /* Number of items in variable part */
} PyVarObject;
/* Header shared by all objects: a reference count and a pointer to the
 * object's type. */
typedef struct _object {
/* NOTE(review): _PyObject_HEAD_EXTRA is defined outside these notes;
 * presumably it is empty in normal builds -- confirm. */
_PyObject_HEAD_EXTRA
Py_ssize_t ob_refcnt;
struct _typeobject *ob_type;
} PyObject;
// Conceptually equivalent flattened layout (illustration only, with
// _PyObject_HEAD_EXTRA left out). The fields appear in the order in which
// the nested headers place them: the PyObject header first, then the
// PyVarObject item count, then the bytes-specific members. The inline
// data array has to be the LAST member, because the bytes of the value
// are stored directly behind the fixed part of the object.
typedef struct {
    Py_ssize_t ob_refcnt;          // from PyObject: reference count
    struct _typeobject *ob_type;   // from PyObject: pointer to the type object
    Py_ssize_t ob_size;            // from PyVarObject: number of bytes stored
    Py_hash_t ob_shash;            // cached hash of the contents, -1 until computed
    // Inline character array (not a pointer). Space for ob_size+1 bytes is
    // provided, and ob_sval[ob_size] is always '\0'.
    char ob_sval[1];
} PyBytesObject;
/*
 * Type object describing bytes. This listing is abridged: the slots
 * between tp_hash and tp_init are left out of these notes.
 */
PyTypeObject PyBytes_Type = {
/* Header of the type object itself: its type is PyType_Type, and the 0
 * is the ob_size of the type object. */
PyVarObject_HEAD_INIT(&PyType_Type, 0)
"bytes", /* tp_name */
PyBytesObject_SIZE, /* tp_basicsize: size of the fixed part of an instance (this is not ob_size) */
sizeof(char), /* tp_itemsize: each variable item is one char */
bytes_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_reserved */
(reprfunc)bytes_repr, /* tp_repr */
/* Three protocol tables are supplied: number, sequence and mapping. */
&bytes_as_number, /* tp_as_number */
&bytes_as_sequence, /* tp_as_sequence */
&bytes_as_mapping, /* tp_as_mapping */
(hashfunc)bytes_hash, /* tp_hash */
/* ... remaining slots elided ... */
0, /* tp_init */
0, /* tp_alloc */
bytes_new, /* tp_new */
PyObject_Del, /* tp_free */
};
1.2 创建PyBytesObject
Python提供了多种路径从C中原生的字符串创建PyBytesObject对象
-
PyBytes_FromString
PyObject * PyBytes_FromString(const char *str) { size_t size; PyBytesObject *op; assert(str != NULL); size = strlen(str); // 判断字符串长度是否超过系统寻址能力 if (size > PY_SSIZE_T_MAX - PyBytesObject_SIZE) { PyErr_SetString(PyExc_OverflowError, "byte string is too long"); return NULL; } // 处理空串:通过nullstring,始终只有一个 if (size == 0 && (op = nullstring) != NULL) { #ifdef COUNT_ALLOCS null_strings++; #endif Py_INCREF(op); return (PyObject *)op; } // 处理单个字符(字符串对象缓冲池) if (size == 1 && (op = characters[*str & UCHAR_MAX]) != NULL) { #ifdef COUNT_ALLOCS one_strings++; #endif Py_INCREF(op); return (PyObject *)op; } // 创建新的PyBytesObject对象,并初始化 /* Inline PyObject_NewVar */ op = (PyBytesObject *)PyObject_MALLOC(PyBytesObject_SIZE + size); if (op == NULL) return PyErr_NoMemory(); (void)PyObject_INIT_VAR(op, &PyBytes_Type, size); op->ob_shash = -1; memcpy(op->ob_sval, str, size+1); /* share short strings */ if (size == 0) { nullstring = op; Py_INCREF(op); } else if (size == 1) { characters[*str & UCHAR_MAX] = op; Py_INCREF(op); } return (PyObject *) op; }
python3中没有ob_sstate这个变量!!!
-
PyBytes_FromStringAndSize
/*
 * Build a bytes object of exactly `size` bytes.
 *
 * str may be NULL; in that case the object obtained from
 * _PyBytes_FromSize(size, 0) is returned without copying any data into
 * it. Otherwise `size` bytes are copied from str.
 *
 * Returns a reference the caller owns. Returns NULL with SystemError set
 * when size is negative, or NULL when _PyBytes_FromSize() fails.
 *
 * 1-byte results created from a non-NULL str are shared through the
 * characters[] pool.
 */
PyObject *
PyBytes_FromStringAndSize(const char *str, Py_ssize_t size)
{
    PyBytesObject *op;

    if (size < 0) {
        PyErr_SetString(PyExc_SystemError,
            "Negative size passed to PyBytes_FromStringAndSize");
        return NULL;
    }
    /* Single byte: reuse the cached object from the characters[] pool. */
    if (size == 1 && str != NULL &&
        (op = characters[*str & UCHAR_MAX]) != NULL)
    {
#ifdef COUNT_ALLOCS
        one_strings++;
#endif
        Py_INCREF(op);
        return (PyObject *)op;
    }

    /* Otherwise create a new PyBytesObject of the requested size. */
    op = (PyBytesObject *)_PyBytes_FromSize(size, 0);
    if (op == NULL)
        return NULL;
    if (str == NULL)
        return (PyObject *) op;

    memcpy(op->ob_sval, str, size);
    /* share short strings */
    if (size == 1) {
        characters[*str & UCHAR_MAX] = op;
        Py_INCREF(op);                  /* extra reference held by the cache */
    }
    return (PyObject *) op;
}
1.3 intern机制
1.3.1 Python2中
Python2中通过PyString_InternInPlace实现intern机制
检查两项内容:
(1)是否是PyStringObject(Python2中的字符串对象)?对其子类的对象不做intern处理
(2)是否被intern机制处理过(保证只处理一次)
/*
 * Intern the string object referenced by *p (Python 2).
 *
 * If an equal string is already stored in the `interned` dictionary, *p
 * is replaced by that stored object. Otherwise the string itself is
 * inserted into the dictionary (as both key and value) and marked as
 * interned. Failures to create or update the dictionary are cleared
 * silently and leave *p untouched.
 */
void
PyString_InternInPlace(PyObject **p)
{
register PyStringObject *s = (PyStringObject *)(*p);
PyObject *t;
/* NULL or a non-string argument is treated as a fatal error. */
if (s == NULL || !PyString_Check(s))
Py_FatalError("PyString_InternInPlace: strings only please!");
/* If it's a string subclass, we don't really know what putting
it in the interned dict might do. */
if (!PyString_CheckExact(s))
return;
/* Already interned: nothing to do, so each string is processed once. */
if (PyString_CHECK_INTERNED(s))
return;
/* Create the interned dictionary on first use. */
if (interned == NULL) {
interned = PyDict_New();
if (interned == NULL) {
PyErr_Clear(); /* Don't leave an exception */
return;
}
}
/* Look for an equal string that has been interned before. */
t = PyDict_GetItem(interned, (PyObject *)s);
if (t) {
/* Found: take a reference to the stored object and make *p refer to it. */
Py_INCREF(t);
Py_SETREF(*p, t);
return;
}
/* Not found: store s in the dictionary as both key and value. */
if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
PyErr_Clear();
return;
}
/* The two references in interned are not counted by refcnt.
The string deallocator will take care of this */
Py_REFCNT(s) -= 2;
/* Record the interned state on the object itself. */
PyString_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
}
-
interned是什么?它是一个PyDictObject对象(全局变量interned),用来保存所有被intern的字符串
-
创建临时变量a,在interned中寻找是否有与a相等的对象。
- interned中的指针不作为a的有效引用(不然a的引用计数永远不会降到0,a永远无法销毁)
细节问题:
(1)为什么创建的时候interned的键与值都设置为对象的PyObject指针?
(2)为什么将对象的引用计数减2?
1.3.2 Python3中
Python3中做了修改,移动到了sys库,编译器默认执行
# On Python 3 the former builtin intern() is available as sys.intern, so
# no compatibility shim is required here.
# (Python 2/3-compatible alternative, which needs the third-party `six`
# package: from six.moves import intern)
from sys import intern

# Named `text` rather than `str` so the builtin `str` type is not shadowed.
text = "shanghai"
# intern() returns a str object, so this prints the docstring of the str
# type; use intern.__doc__ to see the documentation of intern itself.
print(intern(text).__doc__)
// Modules/pyexpat.c
/*
 * Convert a C string to a unicode object and intern it in the dictionary
 * kept on the parser object (self->intern).
 *
 * Returns a reference the caller owns:
 *   - the stored object, when an equal string is already in self->intern;
 *   - the freshly converted object otherwise (after inserting it into the
 *     dictionary, or directly when the parser has no intern dictionary).
 * Returns NULL when the conversion or the dictionary insertion fails.
 *
 * NOTE(review): this is a cache private to one XML parser object, not the
 * interpreter-wide string interning table -- confirm which one these
 * notes intend to illustrate.
 */
static PyObject*
string_intern(xmlparseobject *self, const char* str)
{
    PyObject *result = conv_string_to_unicode(str);
    PyObject *value;

    /* result can be NULL if the unicode conversion failed. */
    if (!result)
        return result;
    /* No intern dictionary on this parser: hand back the new object as is. */
    if (!self->intern)
        return result;
    value = PyDict_GetItem(self->intern, result);
    if (!value) {
        /* Not cached yet: store it as both key and value. */
        if (PyDict_SetItem(self->intern, result, result) == 0)
            return result;
        /* Insertion failed: release our reference so result is not leaked. */
        Py_DECREF(result);
        return NULL;
    }
    /* Cached: return the stored object and drop the temporary one. */
    Py_INCREF(value);
    Py_DECREF(result);
    return value;
}
1.4 字符串缓冲池
一个字节的字符对应的对象缓冲池
/* Pool of shared 1-byte bytes objects, one slot per possible byte value
 * (UCHAR_MAX + 1 slots). PyBytes_FromString indexes it with
 * `*str & UCHAR_MAX`. */
static PyBytesObject *characters[UCHAR_MAX + 1];
/* The single shared empty bytes object, reused by PyBytes_FromString for
 * strings of length 0 once it has been set. */
static PyBytesObject *nullstring;
实现过程
(1)创建PyBytesObject对象
(2)进行intern操作(仅适用于Python2的PyStringObject;上文Python3的PyBytes_FromString中没有这一步)
(3)缓存进缓冲池
1.5 与效率相关的问题
背景:实现100个字符串的拼接
实现方法:+
问题:创建N-1个对象,进行N-1次内存的申请与释放
根本原因:不可变对象
解决方法:对存储在list的一组对象进行连接操作(join)
join先遍历list,统计其中所有字符串的总长度,然后只申请一次足够大的内存,最后把各个字符串依次拷贝到这块内存中。
# Identity (`is`) checks on equal-valued int and str objects.
a = 345
# b is bound to the very same object as a.
b = a
# c and d are assigned from two separate occurrences of the same literal.
c = 456
d = 456
print(a is b)
print(a is c)
print(c is d)
# e and f use the same string literal; g differs in its last character.
e = "abc"
f = "abc"
g = "abd"
print(e is f)
print(e is g)
# Output recorded by the author, one line per print call above.
# NOTE(review): `c is d` and `e is f` printing True depends on the
# interpreter reusing one object for equal literals; presumably this was
# run as a script file -- confirm, an interactive session may differ.
# True
# False
# True
# True
# False