【学习笔记】构建小型证券知识图谱(github项目)
感觉这个项目是原作者的一个作业,原作者将它分享到github上了。他构建知识图谱的重点在爬取网页数据和处理数据上,处理好数据以后直接导入neo4j生成知识图谱了。
·
前言
学习github上的项目代码,熟悉知识图谱构建流程。
知识图谱数据源
- 一部分数据爬取自网站
- 另一部分数据直接使用开源的Tushare,这部分数据可以直接构建实体
抽取网页数据代码
在原项目代码上修改了几处,加了一些注释。
import os
import csv
from lxml import etree
def extract(stockpage_dir, directors_csv):
    """Extract the executives of each company/stock from saved HTML pages.

    Args:
        stockpage_dir: (str) directory containing the saved stock pages
        directors_csv: (str) full path of the CSV file to be written

    Each output row is one executive: name, gender, age, code (stock
    code), jobs.  Gender defaults to 'null' and age to -1 when unknown.
    """
    # Collect full paths of all files in the directory (os.listdir returns
    # names in arbitrary order), then keep only the .html pages.
    pages = map(lambda _: os.path.join(stockpage_dir, _), os.listdir(stockpage_dir))
    pages = filter(lambda _: _.endswith('html'), pages)
    headers = ['name', 'gender', 'age', 'code', 'jobs']
    with open(directors_csv, 'w', encoding='utf-8', newline="") as file_directors:
        file_directors_csv = csv.DictWriter(file_directors, headers)
        file_directors_csv.writeheader()
        for page in pages:
            print(page)  # the full path of a stock page
            # The stock code is the file name without directory or extension.
            # basename/splitext handle both '/' and '\\' separators, unlike
            # the original chain of split('/')/split('.')/split('\\') calls,
            # and survive file names that contain extra dots.
            code = os.path.splitext(os.path.basename(page))[0]
            executives = []
            # The pages were saved with GBK encoding.
            with open(page, 'r', encoding='gbk') as file_page:
                content = file_page.read()
            # etree.HTML parses the HTML string into an _Element tree that
            # can be queried with xpath().
            html = etree.HTML(content)
            # Each matched div holds one executive's table.
            divs = html.xpath('//div[@id="ml_001"]//div[contains(@class, "person_table")]')
            for div in divs:
                item = {}
                # Replace separators so names/jobs cannot break the CSV layout.
                item['name'] = div.xpath('.//thead/tr/td/h3/a/text()')[0].replace(',', '-')
                item['jobs'] = div.xpath('.//thead/tr[1]/td[2]/text()')[0].replace(',', '/')
                gender_age_education = div.xpath('.//thead/tr[2]/td[1]/text()')[0].split()
                try:
                    item['gender'] = gender_age_education[0]
                    if item['gender'] not in ('男', '女'):
                        item['gender'] = 'null'  # null for unknown
                except IndexError:
                    item['gender'] = 'null'
                try:
                    item['age'] = gender_age_education[1].strip('岁')
                    try:
                        item['age'] = int(item['age'])
                    except ValueError:
                        item['age'] = -1  # -1 for unknown
                except IndexError:
                    item['age'] = -1
                item['code'] = code
                executives.append(item)
            # Write all executives of this page to the csv file.
            file_directors_csv.writerows(executives)
if __name__ == '__main__':
    # Source pages live under data/stockpage; the flattened executive
    # table is written next to them as executive_prep.csv.
    pages_dir = './data/stockpage'
    output_csv = './data/executive_prep.csv'
    extract(pages_dir, output_csv)
上述代码利用xpath定位数据源网页文件元素,抽取出实体信息存储在csv文件中,与用requests、bs4包爬取网页信息方法不同。但如果想自己写爬虫代码的话,需要熟悉python语言,并且懂一点html相关的知识。抽取数据截图如下:
数据处理
从网站上爬取的数据虽然已存储在csv文件中,但仍需要进一步处理:将每种实体单独存放在一个csv文件中,这样在neo4j中可以直接生成实体,比较方便;实体之间的关系既可以抽取出来存入csv文件,也可以利用py2neo.NodeMatcher查询实体,再利用Relationship生成关系。
在原项目上为mode为写模式的open函数添加了一个newline参数,这样生成的csv文件中不会包含多余的空行。
import os
import csv
import hashlib
# 加密算法主要用来生成实体id
def get_md5(string):
    """Return the hex MD5 digest of *string*.

    Used to derive a stable entity id from an entity's textual key.
    """
    digest = hashlib.md5(string.encode("utf-8"))
    return digest.hexdigest()
# 生成Person实体对应的csv文件
def build_executive(executive_prep, executive_import):
    """Create an 'executive' file in csv format that can be imported into Neo4j.

    format -> person_id:ID,name,gender,age:int,:LABEL
    label -> Person
    """
    print('Writing to {} file...'.format(executive_import.split('/')[-1]))
    with open(executive_prep, 'r', encoding='utf-8') as file_prep, \
            open(executive_import, 'w', encoding='utf-8', newline="") as file_import:
        reader = csv.reader(file_prep, delimiter=',')
        writer = csv.writer(file_import, delimiter=',')
        writer.writerow(['person_id:ID', 'name', 'gender', 'age:int', ':LABEL'])
        for index, record in enumerate(reader):
            # Skip the header row and rows missing name/gender/age.
            if index == 0 or len(record) < 3:
                continue
            # A person's id is the md5 of "name,gender,age".
            person_id = get_md5('{},{},{}'.format(record[0], record[1], record[2]))
            writer.writerow([person_id, record[0], record[1], record[2], 'Person'])
    print('- done.')
# 生成Company实体对应的csv文件
def build_stock(stock_industry_prep, stock_concept_prep, stock_import):
    """Create a 'stock' file in csv format that can be imported into Neo4j.

    format -> stock_id:ID,name,code,:LABEL
    label -> Company, or Company;ST for loss-flagged stocks

    Args:
        stock_industry_prep: (str) path of the stock/industry prep CSV
        stock_concept_prep: (str) path of the stock/concept prep CSV
        stock_import: (str) path of the CSV file to be written
    """
    print('Writing to {} file...'.format(stock_import.split('/')[-1]))
    # Deduplicate stocks seen in both prep files; each entry is 'code,name'.
    stock = set()
    stock.update(_collect_code_names(stock_industry_prep))
    stock.update(_collect_code_names(stock_concept_prep))
    # Name prefixes marking a loss-making ("special treatment") stock.
    st_prefixes = ['*ST', 'ST', 'S*ST', 'SST']
    with open(stock_import, 'w', encoding='utf-8', newline="") as file_import:
        file_import_csv = csv.writer(file_import, delimiter=',')
        file_import_csv.writerow(['stock_id:ID', 'name', 'code', ':LABEL'])
        for entry in stock:
            # maxsplit=1 keeps a name that itself contains a comma intact
            # (plain split(',') silently truncated such names).
            code, name = entry.split(',', 1)
            label = 'Company'
            for prefix in st_prefixes:
                if name.startswith(prefix):
                    # Strip the ST marker from the name, add the ST label.
                    # (The original also set an `ST` flag that was never read.)
                    name = name.replace(prefix, '')
                    label = 'Company;ST'
                    break
            file_import_csv.writerow([code, name, code, label])
    print('- done.')


def _collect_code_names(prep_csv):
    """Yield 'code,name' strings (spaces removed from names) from a prep CSV."""
    with open(prep_csv, 'r', encoding='utf-8') as file_prep:
        file_prep_csv = csv.reader(file_prep, delimiter=',')
        for i, row in enumerate(file_prep_csv):
            if i == 0:  # skip the header row
                continue
            yield '{},{}'.format(row[0], row[1].replace(' ', ''))
# 生成Concept实体对应的csv文件
def build_concept(stock_concept_prep, concept_import):
    """Create a 'concept' file in csv format that can be imported into Neo4j.

    format -> concept_id:ID,name,:LABEL
    label -> Concept
    """
    print('Writing to {} file...'.format(concept_import.split('/')[-1]))
    with open(stock_concept_prep, 'r', encoding='utf-8') as file_prep, \
            open(concept_import, 'w', encoding='utf-8', newline="") as file_import:
        reader = csv.reader(file_prep, delimiter=',')
        writer = csv.writer(file_import, delimiter=',')
        writer.writerow(['concept_id:ID', 'name', ':LABEL'])
        # Deduplicate the concept column (index 2), skipping the header row.
        concepts = {record[2] for index, record in enumerate(reader) if index != 0}
        for concept in concepts:
            # The concept id is the md5 of its name.
            writer.writerow([get_md5(concept), concept, 'Concept'])
    print('- done.')
# 生成Industry实体对应的csv文件
def build_industry(stock_industry_prep, industry_import):
    """Create an 'industry' file in csv format that can be imported into Neo4j.

    format -> industry_id:ID,name,:LABEL
    label -> Industry
    """
    # "Writing to" matches the progress message of every other builder
    # (the original inconsistently printed "Write to").
    print('Writing to {} file...'.format(industry_import.split('/')[-1]))
    with open(stock_industry_prep, 'r', encoding="utf-8") as file_prep, \
            open(industry_import, 'w', encoding='utf-8', newline="") as file_import:
        file_prep_csv = csv.reader(file_prep, delimiter=',')
        file_import_csv = csv.writer(file_import, delimiter=',')
        file_import_csv.writerow(['industry_id:ID', 'name', ':LABEL'])
        # Deduplicate the industry column (index 2), skipping the header row.
        industries = set()
        for i, row in enumerate(file_prep_csv):
            if i == 0:
                continue
            industries.add(row[2])
        for industry in industries:
            # The industry id is the md5 of its name.
            file_import_csv.writerow([get_md5(industry), industry, 'Industry'])
    print('- done.')
# 生成 employ_of 关系对应的 csv 文件,title 是 employ_of 关系的属性
def build_executive_stock(executive_prep, relation_import):
    """Create an 'executive_stock' file in csv format that can be imported into Neo4j.

    format -> :START_ID,jobs,:END_ID,:TYPE
              person            stock
    type -> employ_of  (the 'jobs' column is a property of the relation)
    """
    with open(executive_prep, 'r', encoding='utf-8') as file_prep, \
            open(relation_import, 'w', encoding='utf-8', newline="") as file_import:
        reader = csv.reader(file_prep, delimiter=',')
        writer = csv.writer(file_import, delimiter=',')
        writer.writerow([':START_ID', 'jobs', ':END_ID', ':TYPE'])
        for index, record in enumerate(reader):
            if index == 0:  # header row
                continue
            # START is the person id: md5 of "name,gender,age" — it must use
            # the same scheme as when the Person entities were generated.
            person_id = get_md5('{},{},{}'.format(record[0], record[1], record[2]))
            # END is the stock code; record[4] carries the job titles.
            writer.writerow([person_id, record[4], record[3], 'employ_of'])
# 生成industry_of关系对应的csv文件,这个关系没有属性
def build_stock_industry(stock_industry_prep, relation_import):
    """Create a 'stock_industry' file in csv format that can be imported into Neo4j.

    format -> :START_ID,:END_ID,:TYPE
              stock     industry
    type -> industry_of  (no properties on this relation)
    """
    with open(stock_industry_prep, 'r', encoding='utf-8') as file_prep, \
            open(relation_import, 'w', encoding='utf-8', newline="") as file_import:
        reader = csv.reader(file_prep, delimiter=',')
        writer = csv.writer(file_import, delimiter=',')
        writer.writerow([':START_ID', ':END_ID', ':TYPE'])
        for index, record in enumerate(reader):
            if index == 0:  # header row
                continue
            # START is the stock code; END is the md5-based industry id.
            writer.writerow([record[0], get_md5(record[2]), 'industry_of'])
# 生成 concept_of 关系对应的csv文件,该关系同样没有属性
def build_stock_concept(stock_concept_prep, relation_import):
    """Create a 'stock_concept' file in csv format that can be imported into Neo4j.

    format -> :START_ID,:END_ID,:TYPE
              stock     concept
    type -> concept_of  (no properties on this relation)
    """
    # (The original docstring wrongly described this as the 'stock_industry' file.)
    with open(stock_concept_prep, 'r', encoding='utf-8') as file_prep, \
            open(relation_import, 'w', encoding='utf-8', newline="") as file_import:
        file_prep_csv = csv.reader(file_prep, delimiter=',')
        file_import_csv = csv.writer(file_import, delimiter=',')
        file_import_csv.writerow([':START_ID', ':END_ID', ':TYPE'])
        for i, row in enumerate(file_prep_csv):
            if i == 0:  # header row
                continue
            # START is the stock code; END is the md5-based concept id,
            # matching the ids written by build_concept.
            concept = row[2]
            file_import_csv.writerow([row[0], get_md5(concept), 'concept_of'])
if __name__ == '__main__':
    # Prep CSVs live under data1/; Neo4j import files go to data1/import.
    prep_dir = 'data1'
    import_dir = 'data1/import'
    if not os.path.exists(import_dir):
        os.makedirs(import_dir)
    build_executive(prep_dir + '/executive_prep.csv', import_dir + '/executive.csv')
    build_stock(prep_dir + '/stock_industry_prep.csv', prep_dir + '/stock_concept_prep.csv',
                import_dir + '/stock.csv')
    build_concept(prep_dir + '/stock_concept_prep.csv', import_dir + '/concept.csv')
    build_industry(prep_dir + '/stock_industry_prep.csv', import_dir + '/industry.csv')
    build_executive_stock(prep_dir + '/executive_prep.csv', import_dir + '/executive_stock.csv')
    build_stock_industry(prep_dir + '/stock_industry_prep.csv', import_dir + '/stock_industry.csv')
    build_stock_concept(prep_dir + '/stock_concept_prep.csv', import_dir + '/stock_concept.csv')
运行结果如下:
构建知识图谱
原作者直接将抽取好的实体和关系csv文件导入neo4j生成知识图谱,注意csv文件的存储路径就行。
思考
构建人的实体时,重名问题具体怎么解决?
总结
感觉这个项目是原作者的一个作业,原作者将它分享到github上了。他构建知识图谱的重点在爬取网页数据和处理数据上,处理好数据以后直接导入neo4j生成知识图谱了。
更多推荐
所有评论(0)