完整正确的fpgrowth代码-python
网上关于fpgrowth的代码基本上都是错的,跑出来的结果不唯一,这里我给一份正确的fpgrowth代码。l=[[a,b,c],[d,c,e,g],[a,e,c,e]]这样就可以了。注意处理后的数据集的形式是一个二级列表(如 parsedDat)。
·
完整正确的fpgrowth代码-python
网上关于fpgrowth代码基本上都是错的,跑出来的结果不唯一,这里我给一份正确的fpgrowth代码
# coding:utf-8
class treeNode:
    """A single node of an FP-tree.

    Attributes:
        name: the item stored at this node.
        count: occurrence count, initialised from ``numOccur`` and
            increased through :meth:`inc`.
        nodeLink: next node in the chain of same-item nodes, or ``None``.
        parent: the parent node (``None`` for the root).
        children: dict mapping an item to the child ``treeNode`` holding it.
    """

    def __init__(self, nameValue, numOccur, parentNode):
        self.name = nameValue
        self.count = numOccur
        self.nodeLink = None
        self.parent = parentNode
        self.children = {}

    def inc(self, numOccur):
        """Add ``numOccur`` to this node's count."""
        self.count += numOccur

    def disp(self, ind=1):
        """Print this subtree, indenting each level by one more space."""
        # A single pre-formatted string prints identically on Python 2 and 3
        # and reproduces the spacing of the old
        # ``print ' '*ind, name, ' ', count`` statement.
        print('%s %s   %s' % (' ' * ind, self.name, self.count))
        for child in self.children.values():
            child.disp(ind + 1)
def updateHeader(nodeToTest, targetNode):
    """Append ``targetNode`` to the end of a node-link chain.

    Args:
        nodeToTest: any node of the chain; the chain is followed through
            ``nodeLink`` until its last node is reached.
        targetNode: the node to attach after the last node.
    """
    tail = nodeToTest
    while tail.nodeLink is not None:
        tail = tail.nodeLink
    tail.nodeLink = targetNode
def updateFPtree(items, inTree, headerTable, count):
    """Insert one ordered transaction into the FP-tree.

    Walks down from ``inTree`` following ``items`` in order.  An existing
    child has its count increased by ``count``; a missing child is created
    and linked into the header table's chain for that item.

    Implemented as a loop rather than recursion on ``items[1:]`` so that long
    transactions neither hit the interpreter recursion limit nor copy the
    remaining list at every level.  An empty ``items`` is a no-op.

    Args:
        items: items of the transaction, already filtered and ordered.
        inTree: node under which the path is inserted.
        headerTable: dict ``item -> [count, first node]``; the second slot is
            filled in or its chain extended when a new node is created.
        count: number of occurrences this transaction contributes.
    """
    node = inTree
    for item in items:
        child = node.children.get(item)
        if child is not None:
            # The item already hangs below the current node: just count it.
            child.inc(count)
        else:
            # Start a new branch and register the node in the header chain.
            child = treeNode(item, count, node)
            node.children[item] = child
            if headerTable[item][1] is None:
                headerTable[item][1] = child
            else:
                updateHeader(headerTable[item][1], child)
        node = child
def createFPtree(dataSet, minSup=1):
headerTable = {}
#print dataSet.keys()[0:10]
for trans in dataSet:
# print(trans)
for item in trans:
headerTable[item] = headerTable.get(item, 0) + dataSet[trans]
for k in headerTable.keys():
# print(headerTable[k])
if int(headerTable[k]) < minSup:
# print "yes",int(headerTable[k]) < minSup
del(headerTable[k]) # 删除不满足最小支持度的元素
freqItemSet = set(headerTable.keys()) # 满足最小支持度的频繁项集
if len(freqItemSet) == 0:
return None, None
for k in headerTable:
headerTable[k] = [headerTable[k], None] # element: [count, node]
retTree = treeNode('Null Set', 1, None)
for tranSet, count in dataSet.items():
# dataSet:[element, count]
localD = {}
for item in tranSet:
if item in freqItemSet: # 过滤,只取该样本中满足最小支持度的频繁项
localD[item] = headerTable[item][0] # element : count
if len(localD) > 0:
# 根据全局频数从大到小对单样本排序
# orderedItem = [v[0] for v in sorted(localD.iteritems(), key=lambda p:(p[1], -ord(p[0])), reverse=True)]
orderedItem = [v[0] for v in sorted(localD.iteritems(), key=lambda p:(p[1], int(p[0])), reverse=True)]
# 用过滤且排序后的样本更新树
updateFPtree(orderedItem, retTree, headerTable, count)
# print(headerTable)
return retTree, headerTable
# Walk back up the tree
def ascendFPtree(leafNode, prefixPath):
    """Collect the item names from ``leafNode`` up to, but excluding, the root.

    Names are appended to ``prefixPath`` in leaf-to-root order; the root (the
    node whose ``parent`` is ``None``) is not appended.  Written as a loop so
    that very deep paths do not hit the recursion limit.

    Args:
        leafNode: node to start from.
        prefixPath: list that receives the names (modified in place).
    """
    node = leafNode
    while node.parent is not None:
        prefixPath.append(node.name)
        node = node.parent
# Conditional pattern base
def findPrefixPath(basePat, myHeaderTab):
    """Return the conditional pattern base of ``basePat``.

    Follows the node-link chain that starts at ``myHeaderTab[basePat][1]``.
    For every node on it, the items on the path above the node are collected
    and associated with that node's count.

    Args:
        basePat: the item whose prefix paths are wanted.
        myHeaderTab: header table, ``item -> [count, first node]``.

    Returns:
        dict mapping ``frozenset(prefix items)`` to a count.  Nodes with no
        items above them contribute nothing.
    """
    condPats = {}
    node = myHeaderTab[basePat][1]  # first node holding basePat
    while node is not None:
        prefixPath = []
        # prefixPath comes back leaf-first: [basePat, parent, ..., top item].
        ascendFPtree(node, prefixPath)
        if len(prefixPath) > 1:
            key = frozenset(prefixPath[1:])
            # Accumulate rather than overwrite, so two nodes sharing the same
            # set of ancestors cannot lose counts.
            condPats[key] = condPats.get(key, 0) + node.count
        node = node.nodeLink  # next node holding basePat
    return condPats
def mineFPtree(inTree, headerTable, minSup, preFix, freqItemList):
    """Recursively mine frequent itemsets from an FP-tree.

    Args:
        inTree: the tree belonging to ``headerTable`` (not read here; kept
            for interface compatibility).
        headerTable: header table, ``item -> [count, first node]``.
        minSup: minimum count passed on when building conditional trees.
        preFix: set of items already fixed for this branch of the search.
        freqItemList: output list; every frequent itemset found is appended
            to it as a ``set``.
    """
    # Visit items from least to most frequent.  Sort on the count only:
    # comparing the whole [count, node] entry falls through to comparing
    # tree nodes whenever two counts tie.
    bigL = [entry[0] for entry in
            sorted(headerTable.items(), key=lambda p: p[1][0])]
    for basePat in bigL:
        newFreqSet = preFix.copy()
        newFreqSet.add(basePat)
        freqItemList.append(newFreqSet)
        # Conditional pattern base of the current itemset ...
        condPattBases = findPrefixPath(basePat, headerTable)
        # ... and the conditional FP-tree built from it.
        myCondTree, myHead = createFPtree(condPattBases, minSup)
        if myHead is not None:
            mineFPtree(myCondTree, myHead, minSup, newFreqSet, freqItemList)
def loadSimpDat():
    """Return a small hard-coded list of six transactions for experiments."""
    simDat = [
        ['r', 'z', 'h', 'j', 'p'],
        ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
        ['z'],
        ['r', 'x', 'n', 'o', 's'],
        ['y', 'r', 'x', 'z', 'q', 't', 'p'],
        ['y', 'z', 'x', 'e', 'q', 's', 't', 'm'],
    ]
    return simDat
def createInitSet(dataSet):
    """Count identical transactions.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable items.

    Returns:
        dict mapping ``frozenset(transaction)`` to the number of transactions
        with exactly that set of items.  Item order and repeats inside a
        transaction are ignored.
    """
    retDict = {}
    for trans in dataSet:
        key = frozenset(trans)
        # dict.get replaces the Python-2-only dict.has_key.
        retDict[key] = retDict.get(key, 0) + 1
    return retDict
def calSuppData(headerTable, freqItemList, total):
    """Compute the relative support of every itemset in ``freqItemList``.

    Args:
        headerTable: header table of the full FP-tree,
            ``item -> [count, first node]``.
        freqItemList: iterable of non-empty itemsets.
        total: number of transactions; each count is divided by it.

    Returns:
        dict mapping ``frozenset(itemset)`` to ``count / total``.
    """
    def depthKey(item):
        # Ascending (count, tie-break): the smallest key is the item placed
        # lowest in the tree, assuming the tree was ordered by count with
        # ties broken on the integer value of the item.
        try:
            rank = (0, int(item), str(item))
        except (TypeError, ValueError):
            rank = (1, item)
        return (headerTable[item][0], rank)

    suppData = {}
    for itemSet in freqItemList:
        ordered = sorted(itemSet, key=depthKey)
        if len(ordered) == 1:
            # The header table already holds the item's total count.  Summing
            # its prefix paths would miss nodes that hang directly off the
            # root, because they have no prefix.
            support = headerTable[ordered[0]][0]
        else:
            # Sum the counts of those prefix paths of the lowest item that
            # contain all the other items.
            rest = frozenset(ordered[1:])
            base = findPrefixPath(ordered[0], headerTable)
            support = sum(cnt for prefix, cnt in base.items()
                          if rest.issubset(prefix))
        suppData[frozenset(ordered)] = support / float(total)
    return suppData
def aprioriGen(Lk, k):
    """Join (k-1)-itemsets that share their first k-2 items.

    Args:
        Lk: list of itemsets (sets or frozensets) of equal size.
        k: size of the itemsets to generate.

    Returns:
        list with the union of every pair from ``Lk`` whose sorted first
        ``k - 2`` items are equal.
    """
    # Sort first, then take the prefix.  Slicing the unsorted list makes the
    # prefix depend on set iteration order.
    prefixes = [sorted(itemSet)[:k - 2] for itemSet in Lk]
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i + 1, lenLk):
            if prefixes[i] == prefixes[j]:
                retList.append(Lk[i] | Lk[j])
    return retList
def calcConf(freqSet, H, supportData, br1, minConf=0.7):
    """Keep the consequents of ``freqSet`` that give a confident rule.

    For each ``conseq`` in ``H`` the rule ``(freqSet - conseq) --> conseq``
    is scored as ``support(freqSet) / support(freqSet - conseq)``.  Rules
    reaching ``minConf`` are printed and appended to ``br1``.

    Args:
        freqSet: the frequent itemset (a frozenset).
        H: candidate consequents (frozensets).
        supportData: dict ``frozenset -> support``; it must contain
            ``freqSet`` and every antecedent ``freqSet - conseq``.
        br1: output list receiving ``(antecedent, consequent, conf)`` tuples.
        minConf: minimum confidence for a rule to be kept.

    Returns:
        list of the consequents whose rule was kept.
    """
    prunedH = []
    for conseq in H:
        antecedent = freqSet - conseq
        antecedentSupport = supportData[antecedent]
        if antecedentSupport == 0:
            continue  # confidence is undefined; skip to avoid dividing by 0
        conf = supportData[freqSet] / antecedentSupport
        if conf >= minConf:
            print("{0} --> {1} conf:{2}".format(antecedent, conseq, conf))
            br1.append((antecedent, conseq, conf))
            prunedH.append(conseq)
    return prunedH
def rulesFromConseq(freqSet, H, supportData, br1, minConf=0.7):
    """Generate rules for ``freqSet`` with growing consequents.

    Starting from the consequents in ``H``, each round scores them with
    ``calcConf`` and then merges the surviving ones into consequents one item
    larger with ``aprioriGen``.  Rounds continue while the consequent is
    smaller than ``freqSet`` (so the antecedent is never empty) and at least
    two consequents survive.

    The consequents passed in are scored too.  Jumping straight to the merged
    candidates would produce no rules for 2-item sets and skip every
    single-item consequent.

    Args:
        freqSet: the frequent itemset (a frozenset).
        H: initial consequents, all of the same size; may be empty.
        supportData: dict ``frozenset -> support``.
        br1: output list receiving ``(antecedent, consequent, conf)`` tuples.
        minConf: minimum confidence for a rule to be kept.
    """
    if not H:
        return
    m = len(H[0])
    candidates = H
    while candidates and len(freqSet) > m:
        candidates = calcConf(freqSet, candidates, supportData, br1, minConf)
        if len(candidates) < 2:
            break  # nothing left to merge
        candidates = aprioriGen(candidates, m + 1)
        m += 1
def generateRules(freqItemList, supportData, minConf=0.7):
    """Generate association rules from frequent itemsets.

    Args:
        freqItemList: iterable of itemsets.  Plain ``set`` objects are
            accepted and converted to ``frozenset`` so they can be used as
            keys of ``supportData``.
        supportData: dict ``frozenset -> support``.  ``calcConf`` looks up
            the antecedent of every rule, so for single-item sets it must
            also contain the empty frozenset.
        minConf: minimum confidence for a rule to be kept.

    Returns:
        list of ``(antecedent, consequent, conf)`` tuples.
    """
    bigRuleList = []
    for rawSet in freqItemList:
        freqSet = frozenset(rawSet)
        H1 = [frozenset([item]) for item in freqSet]
        if len(freqSet) > 1:
            rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
        else:
            calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList
main 函数如下:
注意:处理后的数据集必须是一个二级列表(如下面的 parsedDat),
例如 l=[[a,b,c],[d,c,e,g],[a,e,c,e]] 这样就可以了。
import fpgrowth
import time
import data_process
# --- simple data (uncomment to try the small built-in example first) ---
# simDat = fpgrowth.loadSimpDat()
# initSet = fpgrowth.createInitSet(simDat)
# myFPtree, myHeaderTab = fpgrowth.createFPtree(initSet, 3)
# myFPtree.disp()
# print(fpgrowth.findPrefixPath('z', myHeaderTab))
# print(fpgrowth.findPrefixPath('r', myHeaderTab))
# print(fpgrowth.findPrefixPath('t', myHeaderTab))
# freqItems = []
# fpgrowth.mineFPtree(myFPtree, myHeaderTab, 3, set([]), freqItems)
# for x in freqItems:
#     print(x)

# --- kosarak data ---
start = time.time()
n = 11  # minimum support count
# Alternative: read the raw kosarak file directly instead of data_process.
# C:\Users\gaoxi\source\repos\fpgrowth\fpgrowth\fpgrowth-master\data\kosarak.dat
# with open(r"C:\Users\gaoxi\source\repos\fpgrowth\fpgrowth\fpgrowth-master\data\kosarak.dat", "rb") as f:
#     parsedDat = [line.split() for line in f.readlines()]
# print(parsedDat)
parsedDat = data_process.get_data()
initSet = fpgrowth.createInitSet(parsedDat)
myFPtree, myHeaderTab = fpgrowth.createFPtree(initSet, n)
freqItems = []
# createFPtree returns (None, None) when no item reaches the minimum support;
# there is nothing to mine in that case.
if myHeaderTab is not None:
    fpgrowth.mineFPtree(myFPtree, myHeaderTab, n, set([]), freqItems)
# A single formatted string prints the same on Python 2 and Python 3.
print("%s sec" % (time.time() - start))

# Compute support values of freqItems.
if myHeaderTab is not None:
    suppData = fpgrowth.calSuppData(myHeaderTab, freqItems, len(parsedDat))
else:
    suppData = {}
# Single-item sets produce rules with an empty antecedent, whose support is
# looked up by calcConf.
suppData[frozenset([])] = 1.0
for x, v in suppData.items():
    print("%s %s" % (x, v))

minConf = 0.8
freqItems = [frozenset(x) for x in freqItems]
fpgrowth.generateRules(freqItems, suppData, minConf)
更多推荐
已为社区贡献10条内容
所有评论(0)