LR

# -*- coding: utf-8 -*-
__author__ = 'Wsine'

from numpy import *
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import operator
import time

LINE_OF_DATA = 6
LINE_OF_TEST = 4

def createTrainDataSet():
    trainDataMat = [[1, 1, 4], 
                    [1, 2, 3], 
                    [1, -2, 3], 
                    [1, -2, 2], 
                    [1, 0, 1], 
                    [1, 1, 2]]
    trainShares = [1, 1, 1, 0, 0,  0]
    return trainDataMat, trainShares

def createTestDataSet():
    testDataMat = [[1, 1, 1], 
                   [1, 2, 0], 
                   [1, 2, 4], 
                   [1, 1, 3]]
    return testDataMat

def autoNorm(dataSet):
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    normDataSet = zeros(shape(dataSet))
    m = dataSet.shape[0]
    normDataSet = dataSet - tile(minVals, (m, 1))
    normDataSet = normDataSet / tile(ranges, (m, 1))
    return normDataSet[:LINE_OF_DATA], normDataSet[LINE_OF_DATA:]

def sigmoid(inX):
    return 1.0 / (1 + exp(-inX))

def gradAscent(dataMatIn, classLabels, alpha=0.001, maxCycles=1000):
    dataMatrix = mat(dataMatIn)
    labelMat = mat(classLabels).transpose()
    m, n = shape(dataMatrix)
    weights = ones((n, 1))
    for k in range(maxCycles):
        h = sigmoid(dataMatrix * weights)
        error = (labelMat - h)
        weights = weights + alpha * dataMatrix.transpose() * error
    return weights

def plotBestFit(weights):
    dataMat, labelMat = createTrainDataSet()
    dataArr = array(dataMat)
    n = shape(dataArr)[0]
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1])
            ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1])
            ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = arange(-3.0, 3.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1'); plt.ylabel('X2')
    #plt.show()
    plt.savefig('lena_new_sz.png')

def classifyVector(inX, weights):
    prob = sigmoid(sum(inX * weights))
    if prob > 0.5:
        return 1
    else:
        return 0

def classifyAll(dataSet, weights):
    predict = []
    for vector in dataSet:
        predict.append(classifyVector(vector, weights))
    return predict

def main():
    trainDataSet, trainShares = createTrainDataSet()
    testDataSet = createTestDataSet()
    #trainDataSet, testDataSet = autoNorm(vstack((mat(trainDataSet), mat(testDataSet))))
    regMatrix = gradAscent(trainDataSet, trainShares, 0.01, 600)
    print("regMatrix = \n", regMatrix)
    plotBestFit(regMatrix.getA())
    predictShares = classifyAll(testDataSet, regMatrix)
    print("predictResult: \n", predictShares)

if __name__ == '__main__':
    start = time.clock()
    main()
    end = time.clock()
    print('finish all in %s' % str(end - start))

另一个LR的例子

# -*- coding: utf-8 -*-
from numpy import *
from sklearn.datasets import load_iris     # import datasets
from sklearn.externals import joblib 
# load the dataset: iris
iris = load_iris()
samples = iris.data
#print samples 
target = iris.target

# import the LogisticRegression
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()  # 使用类,参数全是默认的
classifier.fit(samples, target)  # 训练数据来学习,不需要返回值

x = classifier.predict([[5, 3, 5, 2.5]])  # 测试数据,分类返回标记

print x[0]

#保存模型
joblib.dump(classifier, 'model.pickle')

#载入模型
model = joblib.load('model.pickle')

x = model.predict([[5, 3, 5, 2.5]])  # 测试数据,分类返回标记

print x[0]
print "end"

随机森林


import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
train = pd.read_csv("train2.csv", dtype={"Age": np.float64},)
print train.head(10)

def harmonize_data(titanic):
    titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())
    titanic.loc[titanic["Sex"] == "male", "Sex"] = 0
    titanic.loc[titanic["Sex"] == "female", "Sex"] = 1

    titanic["Embarked"] = titanic["Embarked"].fillna("S")

    titanic.loc[titanic["Embarked"] == "S", "Embarked"] = 0
    titanic.loc[titanic["Embarked"] == "C", "Embarked"] = 1
    titanic.loc[titanic["Embarked"] == "Q", "Embarked"] = 2

    titanic["Fare"] = titanic["Fare"].fillna(titanic["Fare"].median())

    return titanic

harmonize_data(train)
print "ok"
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
results = []
sample_leaf_options = list(range(1, 500, 3))
n_estimators_options = list(range(1, 1000, 5))
groud_truth = train['Survived'][601:]

alg = RandomForestClassifier(min_samples_leaf=50, n_estimators=5, random_state=50)
alg.fit(train[predictors][:600], train['Survived'][:600])
predict = alg.predict(train[predictors][601:])
#print groud_truth == predict
results.append((50, 5, (groud_truth == predict).mean()))
#print((groud_truth == predict).mean())

print(results)

资源文件

http://down.51cto.com/data/2450205


随机森林例子2

#!/usr/bin/env python
# coding=utf8

import sys
reload(sys)
sys.setdefaultencoding('utf8')

'''
Competition URL: https://www.kaggle.com/c/digit-recognizer
Solution:  Random Forest
'''

# 引入需要的包包
# 数据处理的常用包包
import numpy as np
import pandas as pd

# 随机森林的包包
import sklearn as skl
from sklearn.ensemble import RandomForestClassifier

# 画图的包包
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
# 读取数据(请先去 https://www.kaggle.com/c/digit-recognizer/data 上下载数据)
# 读取成DataFrame的数据
train_df = pd.read_csv('train.csv')
# 将DataFrame的数据转换成Array
train_data = train_df.values

test_df = pd.read_csv('test.csv')
test_data = test_df.values

print "head--------------"
print train_df.head()
print "train_data-------------"
print train_data

# 画图
plt.figure(figsize=(12,8))
sns.countplot(x='label', data=train_df)
plt.title('Distribution of Numbers')
plt.xlabel('Numbers');

# 2/3的train_data作为训练数据,1/3的train_data作为测试数据来训练模型
num_features = train_data.shape[0] # 拿到train_data的行数,也就是所有数据的个数作为特征值
print("Number of all features: \t\t", num_features)
split = int(num_features * 2/3) # 这里是取2/3行也就是前28000行作为训练 后1/3也就是14000作为测试

train = train_data[:split] # 取出前28000行作为训练数据
test = train_data[split:] # 取出后14000行作为测试数据

print("Number of features used for training: \t", len(train),
      "\nNumber of features used for testing: \t", len(test))

# 开始使用随机森林分类器
clf = RandomForestClassifier(n_estimators=100) # 定义决策树的个数为100

# 开始训练,训练的X数据格式为[[]],训练的y值为[]也就是经过ravel后的数据
# 如果你问我ravel()的作用是什么,就是不管什么数据格式的数据都转成一个一维的array,这样每个元素都是一个平等且顺序的位置
model = clf.fit(train[:,1:], train[:,0].ravel())

# 然后预测
output = model.predict(test[:,1:])

# 计算准确度
acc = np.mean(output == test[:,0].ravel()) *100
print("The accuracy of the pure RandomForest classifier is: \t", acc, "%")

# 利用
clf = RandomForestClassifier(n_estimators=100) # 100 trees

# 用全部训练数据来做训练
target = train_data[:,0].ravel()
train = train_data[:,1:]
model = clf.fit(train, target)

# 用测试集数据来预测最终结果
output = model.predict(test_data)
print output

# 输出预测结果
pd.DataFrame({"ImageId": range(1, len(output)+1), "Label": output}).to_csv('out.csv', index=False, header=True)

资源文件
https://www.kaggle.com/c/digit-recognizer


FM 例子, fm与als 都属于矩阵分解算法,前者适合分类回归, 后者适合协同过滤

#coding:UTF-8

from __future__ import division
from math import exp
import numpy as np
from numpy import *
from random import normalvariate#正态分布
from datetime import datetime

trainData = 'diabetes_train.txt'
testData = 'diabetes_test.txt'
featureNum = 8
max_list = []
min_list = []

def normalize(x_list,max_list,min_list):
    index = 0
    scalar_list = []
    for x in x_list:
        x_max = max_list[index]
        x_min = min_list[index]
        if x_max == x_min:
            x = 1.0    
        else:
            x = round((x-x_min)/(x_max-x_min),4)
        scalar_list.append(x)
        index += 1
    return scalar_list

def loadTrainDataSet(data):
    global max_list
    global min_list
    dataMat = []
    labelMat = []

    fr = open(data)#打开文件

    for line in fr.readlines():
        currLine = line.strip().split(',')
        #lineArr = [1.0]
        lineArr = []

        for i in xrange(featureNum):
            lineArr.append(float(currLine[i]))

        dataMat.append(lineArr)

        labelMat.append(float(currLine[-1]) * 2 - 1)

    data_array = np.array(dataMat)
    max_list = np.max(data_array,axis=0)
    min_list = np.min(data_array,axis=0)

    scalar_dataMat = []
    for row in dataMat:
        scalar_row = normalize(row,max_list,min_list)
        scalar_dataMat.append(scalar_row)
    return scalar_dataMat, labelMat

def loadTestDataSet(data):
    global max_list
    global min_list
    dataMat = []
    labelMat = []

    fr = open(data)#打开文件

    for line in fr.readlines():
        currLine = line.strip().split(',')
        lineArr = []

        for i in xrange(featureNum):
            lineArr.append(float(currLine[i]))

        dataMat.append(lineArr)

        labelMat.append(float(currLine[-1]) * 2 - 1)

    data_array = np.array(dataMat)

    scalar_dataMat = []
    for row in dataMat:
        scalar_row = normalize(row,max_list,min_list)
        scalar_dataMat.append(scalar_row)
    return scalar_dataMat, labelMat

def sigmoid(inx):
    return 1. / (1. + exp(-max(min(inx, 15.), -15.)))
    #return 1.0 / (1 + exp(-inx))

def stocGradAscent(dataMatrix, classLabels, k, iter):
    #dataMatrix用的是mat, classLabels是列表
    m, n = shape(dataMatrix)
    alpha = 0.01
    #初始化参数
    #w = random.randn(n, 1)#其中n是特征的个数
    w = zeros((n, 1))
    w_0 = 0.
    v = normalvariate(0, 0.2) * ones((n, k))

    for it in xrange(iter):
        print it
        for x in xrange(m):#随机优化,对每一个样本而言的
            inter_1 = dataMatrix[x] * v
            inter_2 = multiply(dataMatrix[x], dataMatrix[x]) * multiply(v, v)#multiply对应元素相乘
            #完成交叉项
            interaction = sum(multiply(inter_1, inter_1) - inter_2) / 2.

            p = w_0 + dataMatrix[x] * w + interaction#计算预测的输出
            #print "y: ",p 
            loss = sigmoid(classLabels[x] * p[0, 0]) - 1
            #print "loss: ",loss

            w_0 = w_0 - alpha * loss * classLabels[x]

            for i in xrange(n):
                if dataMatrix[x, i] != 0:
                    w[i, 0] = w[i, 0] - alpha * loss * classLabels[x] * dataMatrix[x, i]
                    for j in xrange(k):
                        v[i, j] = v[i, j] - alpha * loss * classLabels[x] * (dataMatrix[x, i] * inter_1[0, j] - v[i, j] * dataMatrix[x, i] * dataMatrix[x, i])

    return w_0, w, v

def getAccuracy(dataMatrix, classLabels, w_0, w, v):
    m, n = shape(dataMatrix)
    allItem = 0
    error = 0
    result = []
    for x in xrange(m):
        allItem += 1
        inter_1 = dataMatrix[x] * v
        inter_2 = multiply(dataMatrix[x], dataMatrix[x]) * multiply(v, v)#multiply对应元素相乘
        #完成交叉项
        interaction = sum(multiply(inter_1, inter_1) - inter_2) / 2.
        p = w_0 + dataMatrix[x] * w + interaction#计算预测的输出

        pre = sigmoid(p[0, 0])

        result.append(pre)

        if pre < 0.5 and classLabels[x] == 1.0:
            error += 1
        elif pre >= 0.5 and classLabels[x] == -1.0:
            error += 1
        else:
            continue

    print result

    return float(error) / allItem

if __name__ == '__main__':
    dataTrain, labelTrain = loadTrainDataSet(trainData)
    dataTest, labelTest = loadTestDataSet(testData)
    date_startTrain = datetime.now()
    print "开始训练"
    w_0, w, v = stocGradAscent(mat(dataTrain), labelTrain, 20, 500)
    print "训练准确性为:%f" % (1 - getAccuracy(mat(dataTrain), labelTrain, w_0, w, v))
    date_endTrain = datetime.now()
    print "训练时间为:%s" % (date_endTrain - date_startTrain)
    print "开始测试"
    print "测试准确性为:%f" % (1 - getAccuracy(mat(dataTest), labelTest, w_0, w, v))  

GBTS

#!/usr/bin/env python
# coding=utf8

import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV

import matplotlib.pylab as plt

train = pd.read_csv('train_modified.csv')
target='Disbursed' # Disbursed的值就是二元分类的输出
IDcol = 'ID'
print train['Disbursed'].value_counts()
x_columns = [x for x in train.columns if x not in [target, IDcol]]
X = train[x_columns]
y = train['Disbursed']
gbm0 = GradientBoostingClassifier(random_state=10)
gbm0.fit(X,y)
y_pred = gbm0.predict(X)
y_predprob = gbm0.predict_proba(X)[:,1]
print "Accuracy : %.4g" % metrics.accuracy_score(y.values, y_pred)
print "AUC Score (Train): %f" % metrics.roc_auc_score(y, y_predprob)

资源地址:http://files.cnblogs.com/files/pinard/train_modified.zip


GBTS+LR

# Author: Tim Head <betatim@gmail.com>
#
# License: BSD 3 clause

import numpy as np
np.random.seed(10)

import matplotlib.pyplot as plt

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
                              GradientBoostingClassifier)
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.pipeline import make_pipeline

n_estimator = 10
X, y = make_classification(n_samples=80000)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
# It is important to train the ensemble of trees on a different subset
# of the training data than the linear regression model to avoid
# overfitting, in particular if the total number of leaves is
# similar to the number of training samples
X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train,
                                                            y_train,
                                                            test_size=0.5)

# Unsupervised transformation based on totally random trees
rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator,
    random_state=0)

rt_lm = LogisticRegression()
pipeline = make_pipeline(rt, rt_lm)
pipeline.fit(X_train, y_train)
y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)

# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = OneHotEncoder()
rf_lm = LogisticRegression()
rf.fit(X_train, y_train)
rf_enc.fit(rf.apply(X_train))
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)

y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)

grd = GradientBoostingClassifier(n_estimators=n_estimator)
grd_enc = OneHotEncoder()
grd_lm = LogisticRegression()
grd.fit(X_train, y_train)
grd_enc.fit(grd.apply(X_train)[:, :, 0])
grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)

y_pred_grd_lm = grd_lm.predict_proba(
    grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)

# The gradient boosted model by itself
y_pred_grd = grd.predict_proba(X_test)[:, 1]
fpr_grd, tpr_grd, _ = roc_curve(y_test, y_pred_grd)

# The random forest model by itself
y_pred_rf = rf.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)

plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_rt_lm, tpr_rt_lm, label='RT + LR')
plt.plot(fpr_rf, tpr_rf, label='RF')
plt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')
plt.plot(fpr_grd, tpr_grd, label='GBT')
plt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()

plt.figure(2)
plt.xlim(0, 0.2)
plt.ylim(0.8, 1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_rt_lm, tpr_rt_lm, label='RT + LR')
plt.plot(fpr_rf, tpr_rf, label='RF')
plt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')
plt.plot(fpr_grd, tpr_grd, label='GBT')
plt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve (zoomed in at top left)')
plt.legend(loc='best')
plt.show()