利用Python实现json格式转换为csv文件格式
前言
本文是学校的课程设计。这里我没有用封装好的 json 库来实现,而是把读进来的文件当作一个字符串来处理,核心其实是 Python 中 eval() 这类“把字符串当作表达式求值”的函数:求值之后就能得到对应的字典或列表。
什么是 JSON?
我们要考虑到json格式下key-value对的结构是无序的。JSON:JavaScript Object Notation(JavaScript 对象表示法),是存储和交换文本信息的语法。有点类似 XML,JSON 比 XML 更小、更快,更易解析,同时删除了不必要的元素信息。
读取文件
首先读取文件数据到data_str, 这里注意文件编码,否则会出现乱码。
# Load the entire JSON document as one string. The encoding is given
# explicitly so that non-ASCII text is decoded correctly.
with open('final.json', mode='r', encoding='UTF-8') as file_in:
    data_str = file_in.read()
观察一下读取的json内容
# Show the first and last 100 characters to sanity-check what was read.
head_preview = data_str[:100]
tail_preview = data_str[-100:]
print("字符串头: ", head_preview)
print("字符串尾: ", tail_preview)
字符串头: {
"data":{
"currentTime":1546510953211,
"pageCount":4,
"page":3,
字符串尾: "brandId":157402,
"isXbProduct":1
}
]
},
"code":"0"
}
json文件格式
- 对象可以包含多个 key/value(键/值)对。
- key 必须是字符串,value 可以是合法的 JSON 数据类型(字符串, 数字, 对象, 数组, 布尔值或 null)。
- key 和 value 之间使用冒号(:)分隔。
- 各个 key/value 对之间使用逗号(,)分隔。
替换元素
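对json文件做一个替换:把 JSON 的 true / false / null 换成 Python 的 True / False / None。需要注意,直接对整个字符串做全局替换是有问题的,因为只有在引号外的元素才需要替换,字符串内容里恰好出现的这些单词不应该被改动。可以在从前向后扫描时统计已扫过的引号数量并对 2 取模(转义的引号 \" 不计数),以此判断当前位置是否在引号内。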
import re

# Rewrite the JSON literals true/false/null as the Python literals
# True/False/None so the text can later be evaluated as a Python literal.
#
# A plain str.replace() would also rewrite these words when they occur inside
# quoted keys or string values. The pattern therefore tries to match a whole
# double-quoted string first (backslash escapes included); such a match is
# not a key of the mapping below and is written back unchanged. Only bare
# keywords outside quotes are translated.
_json_to_python = {'true': 'True', 'false': 'False', 'null': 'None'}
data_str = re.sub(
    r'"(?:[^"\\]|\\.)*"|\b(?:true|false|null)\b',
    lambda match: _json_to_python.get(match.group(0), match.group(0)),
    data_str,
)
# Valid JSON strings cannot contain raw newlines, so removing them only
# strips layout between tokens.
data_str = data_str.replace('\n', '')
类型函数
判断当前的字符串属于哪个类型
def query_type(data):
    """Classify stringified data as ``"dict"``, ``"list"`` or ``"value"``.

    Only the first non-whitespace character is inspected: ``{`` means
    ``"dict"``, ``[`` means ``"list"``, anything else (including the empty
    string) means ``"value"``. Brackets that appear later in the text, for
    example inside a product name, do not affect the result.

    Args:
        data: The text to classify (callers pass ``str(obj)``).

    Returns:
        One of the strings ``"dict"``, ``"list"`` or ``"value"``.
    """
    stripped = data.lstrip()
    if stripped.startswith('{'):
        return "dict"
    if stripped.startswith('['):
        return "list"
    return "value"
head函数的实现
遍历获得json的所有key作为csv的表头
data_head = []  # CSV column names in first-seen order (a list keeps the order)


def get_json_head(data, loc=""):
    """Collect the key path of every scalar in ``data`` into ``data_head``.

    ``data`` is converted to text with ``str()`` and classified with
    ``query_type``. Dict members extend the path with ``"_" + key``; list
    items keep the path of the list itself. When a scalar is reached, the
    path without its leading underscore is appended to the module-level
    ``data_head`` list unless it is already present.

    Args:
        data: The pre-processed JSON text on the first call, or a nested
            Python object on recursive calls.
        loc: Underscore-joined key path accumulated so far.

    Returns:
        None. The result is the side effect on ``data_head``.

    Raises:
        ValueError, SyntaxError: If text classified as a dict or list is not
            a valid Python literal.
    """
    import ast  # local import keeps this block self-contained

    data = str(data)  # classification and parsing both work on text
    data_type = query_type(data)
    if data_type == "value":
        column = loc[1:]  # drop the leading "_" of the accumulated path
        if column not in data_head:
            data_head.append(column)
        return
    if data_type == "dict":
        # NOTE(security): this text originates from a file, so it is parsed
        # with ast.literal_eval instead of eval(). literal_eval accepts only
        # Python literals and cannot execute code embedded in the data.
        for key, value in ast.literal_eval(data).items():
            get_json_head(value, loc + "_" + key)
        return
    if data_type == "list":
        # Same parsing rule as above; items share the list's own path.
        for item in ast.literal_eval(data):
            get_json_head(item, loc)
        return
# Walk the document once to fill data_head, then print a sample of it.
get_json_head(data_str)
first_heads, last_heads = data_head[:10], data_head[-10:]
print(first_heads)
print(last_heads)
head_count = len(data_head)
print("表头数量:", head_count)
['data_currentTime', 'data_pageCount', 'data_page', 'data_pageSize', 'data_isNoStock', 'data_success', 'data_wareInfoList_name', 'data_wareInfoList_skuId', 'data_wareInfoList_imageUrl', 'data_wareInfoList_price']
['data_wareInfoList_seckillPromotion_minAmount', 'data_wareInfoList_seckillPromotion_maxAmount', 'data_wareInfoList_seckillPromotion_price', 'data_wareInfoList_seckillPromotion_promotionPrice', 'data_wareInfoList_promotionList_skuList_adWords', 'data_wareInfoList_limitcount', 'data_wareInfoList_limitHour', 'data_wareInfoList_promotionList_limitTime', 'data_wareInfoList_seckillPromotion_limitTime', 'code']
表头数量: 116
body内容的转换
data_head_dict保存转换后CSV的body内容
# One column per header; each column is its own list of 200 empty cells.
data_head_dict = {head: [""] * 200 for head in data_head}
关于表格大小的拓展:这里直接为每一列预分配 200 行空字符串,实现得比较暴力(数据超过 200 行时会下标越界)。解决方案如下:
- 最后自底向上删空行
- 处理head的时候加tag记录行号
# For the first ten columns, print the name, the first ten cells and the length.
for key in data_head[:10]:
    column = data_head_dict[key]
    print(key, column[:10], len(column))
data_currentTime ['', '', '', '', '', '', '', '', '', ''] 200
data_pageCount ['', '', '', '', '', '', '', '', '', ''] 200
data_page ['', '', '', '', '', '', '', '', '', ''] 200
data_pageSize ['', '', '', '', '', '', '', '', '', ''] 200
data_isNoStock ['', '', '', '', '', '', '', '', '', ''] 200
data_success ['', '', '', '', '', '', '', '', '', ''] 200
data_wareInfoList_name ['', '', '', '', '', '', '', '', '', ''] 200
data_wareInfoList_skuId ['', '', '', '', '', '', '', '', '', ''] 200
data_wareInfoList_imageUrl ['', '', '', '', '', '', '', '', '', ''] 200
data_wareInfoList_price ['', '', '', '', '', '', '', '', '', ''] 200
row_now = 0  # highest row index handed out so far; advanced by list items


def get_json_table(data, loc="", rows=0):
    """Write every scalar in ``data`` into its cell of ``data_head_dict``.

    ``data`` is converted to text with ``str()`` and classified with
    ``query_type``. A scalar is stored, as text, in the column named by the
    key path (``loc`` without its leading underscore) at row ``rows``. Dict
    members are written to the same ``rows`` with the path extended by
    ``"_" + key``. For a list, the first item is written at the current
    value of the module-level ``row_now``; every later item first increments
    ``row_now`` and is written at the new value.

    Args:
        data: The pre-processed JSON text on the first call, or a nested
            Python object on recursive calls.
        loc: Underscore-joined key path accumulated so far.
        rows: Row index that scalars reached directly from this call use.

    Returns:
        None. The result is the side effect on ``data_head_dict`` and
        ``row_now``.

    Raises:
        KeyError: If the key path has no column in ``data_head_dict``.
        IndexError: If the row index is beyond the pre-allocated column.
        ValueError, SyntaxError: If text classified as a dict or list is not
            a valid Python literal.
    """
    global row_now
    import ast  # local import keeps this block self-contained

    data = str(data)  # classification and parsing both work on text
    data_type = query_type(data)
    if data_type == "value":
        data_head_dict[loc[1:]][rows] = data
        return
    if data_type == "dict":
        # NOTE(security): this text originates from a file, so it is parsed
        # with ast.literal_eval instead of eval(). literal_eval accepts only
        # Python literals and cannot execute code embedded in the data.
        for key, value in ast.literal_eval(data).items():
            get_json_table(value, loc + "_" + key, rows)
        return
    if data_type == "list":
        for index, item in enumerate(ast.literal_eval(data)):
            if index > 0:
                row_now += 1  # every item after the first opens a new row
            get_json_table(item, loc, row_now)
        return
# Fill data_head_dict from the whole document, starting at row 0 with an empty key path.
get_json_table(data_str)
看一下表的内容是否正确
# Spot-check the filled table: name, first ten cells and length of the first ten columns.
for key in data_head[:10]:
    cells = data_head_dict[key]
    print(key, cells[:10], len(cells))
data_currentTime ['1546510953211', '', '', '', '', '', '', '', '', ''] 200
data_pageCount ['4', '', '', '', '', '', '', '', '', ''] 200
data_page ['3', '', '', '', '', '', '', '', '', ''] 200
data_pageSize ['10', '', '', '', '', '', '', '', '', ''] 200
data_isNoStock ['0', '', '', '', '', '', '', '', '', ''] 200
data_success ['True', '', '', '', '', '', '', '', '', ''] 200
data_wareInfoList_name ['【自营】【整箱】京华门 北京二锅头酒绿瓶 56度 清香风格 500ml*12瓶(产品包装升级随机发货)', '【自营】【整箱】喜力(Heineken)啤酒500ml*12听', '', '', '', '', '', '', '', ''] 200
data_wareInfoList_skuId ['100000388367', '7120050', '', '', '', '', '', '', '', ''] 200
data_wareInfoList_imageUrl ['//img10.360buyimg.com/n7/jfs/t1/8619/15/8670/256329/5c0f87a6E64dac15c/a1e17b14bb60ddc0.jpg', '//img10.360buyimg.com/n7/jfs/t18775/305/1764241768/131776/ab95ffad/5ad70385N5373f0dc.jpg', '', '', '', '', '', '', '', ''] 200
data_wareInfoList_price ['89.00', '86.00', '', '', '', '', '', '', '', ''] 200
输出CSV文件
import csv

# Write the table as GBK-encoded CSV. csv.writer quotes any cell that
# contains a comma, a double quote or a line break, so such cells no longer
# shift the remaining columns of their row.
# newline="" leaves line endings to the writer; lineterminator="\n" keeps
# the "\n" row ending this script has always written.
with open('final_gbk.csv', 'w', encoding="gbk", newline="") as file_out:
    writer = csv.writer(file_out, lineterminator="\n")
    writer.writerow(data_head)  # header row
    # zip(*columns) turns the per-column lists into per-row tuples.
    writer.writerows(zip(*(data_head_dict[head] for head in data_head)))
以utf-8编码格式输出文件
import csv

# Write the table as UTF-8-encoded CSV. csv.writer quotes any cell that
# contains a comma, a double quote or a line break, so such cells no longer
# shift the remaining columns of their row.
# newline="" leaves line endings to the writer; lineterminator="\n" keeps
# the "\n" row ending this script has always written.
with open('final_utf8.csv', 'w', encoding="utf-8", newline="") as file_out:
    writer = csv.writer(file_out, lineterminator="\n")
    writer.writerow(data_head)  # header row
    # zip(*columns) turns the per-column lists into per-row tuples.
    writer.writerows(zip(*(data_head_dict[head] for head in data_head)))
import pandas as pd
# Load the UTF-8 CSV back with pandas and preview its first five rows.
csv_data = pd.read_csv('final_utf8.csv')
csv_data.head(n=5)
data_currentTime | data_pageCount | data_page | data_pageSize | data_isNoStock | data_success | data_wareInfoList_name | data_wareInfoList_skuId | data_wareInfoList_imageUrl | data_wareInfoList_price | ... | data_wareInfoList_seckillPromotion_minAmount | data_wareInfoList_seckillPromotion_maxAmount | data_wareInfoList_seckillPromotion_price | data_wareInfoList_seckillPromotion_promotionPrice | data_wareInfoList_promotionList_skuList_adWords | data_wareInfoList_limitcount | data_wareInfoList_limitHour | data_wareInfoList_promotionList_limitTime | data_wareInfoList_seckillPromotion_limitTime | code | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.546511e+12 | 4.0 | 3.0 | 10.0 | 0.0 | True | 【自营】【整箱】京华门 北京二锅头酒绿瓶 56度 清香风格 500ml*12瓶(产品包装升级... | 1.000004e+11 | //img10.360buyimg.com/n7/jfs/t1/8619/15/8670/2... | 89.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 |
1 | NaN | NaN | NaN | NaN | NaN | NaN | 【自营】【整箱】喜力(Heineken)啤酒500ml*12听 | 7.120050e+06 | //img10.360buyimg.com/n7/jfs/t18775/305/176424... | 86.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 116 columns