import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
import pydot
from sklearn.externals.six import StringIO
from IPython.display import Image
import pydotplus
# Load the training set; "Age" is forced to float64 so missing ages are read as NaN.
train = pd.read_csv("train2.csv", dtype={"Age": np.float64})
# Single-argument call form prints identically on Python 2 and Python 3.
print(train.head(10))

def harmonize_data(titanic):
    """Fill missing values and encode the categorical columns, in place.

    The frame must have "Age", "Sex", "Embarked" and "Fare" columns.

    * "Age" and "Fare": missing entries are replaced by the column median.
    * "Sex": "male" -> 0, "female" -> 1.
    * "Embarked": missing entries are treated as "S", then
      "S" -> 0, "C" -> 1, "Q" -> 2.

    Labels that are not in the tables above are left unchanged.

    Args:
        titanic: pandas DataFrame to clean. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    sex_codes = {"male": 0, "female": 1}
    embarked_codes = {"S": 0, "C": 1, "Q": 2}

    for column in ("Age", "Fare"):
        titanic[column] = titanic[column].fillna(titanic[column].median())

    # Encode through a lookup and reassign the whole column instead of writing
    # integers into a string column via .loc masks: the column comes out
    # numeric when every label is known, and no int is ever stored into a
    # string-typed column.
    titanic["Sex"] = titanic["Sex"].map(
        lambda label: sex_codes.get(label, label), na_action="ignore")

    titanic["Embarked"] = titanic["Embarked"].fillna("S").map(
        lambda label: embarked_codes.get(label, label), na_action="ignore")

    return titanic

# Clean the training frame in place (the return value is the same object).
harmonize_data(train)
print("ok")
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
results = []
# NOTE(review): these two option lists are not used in the visible code;
# presumably left over from a hyper-parameter sweep. Kept in case other code
# reads them.
sample_leaf_options = list(range(1, 500, 3))
n_estimators_options = list(range(1, 1000, 5))

# Train on the first 600 rows and evaluate on everything after them. The
# hold-out slice starts at the same index the training slice ends at, so no
# row is silently dropped between the two sets.
train_rows = 600
min_samples_leaf = 50
n_estimators = 5
groud_truth = train['Survived'][train_rows:]

alg = RandomForestClassifier(min_samples_leaf=min_samples_leaf,
                             n_estimators=n_estimators,
                             random_state=50)
alg.fit(train[predictors][:train_rows], train['Survived'][:train_rows])
predict = alg.predict(train[predictors][train_rows:])
# Record (min_samples_leaf, n_estimators, hold-out accuracy).
results.append((min_samples_leaf, n_estimators, (groud_truth == predict).mean()))

print(results)

# Write one PDF per decision tree of the fitted forest.
Estimators = alg.estimators_
# Rendering options shared by every tree.
# NOTE(review): class_names assumes class 0 means "die" and class 1 means
# "live" -- confirm against the "Survived" column encoding.
export_options = dict(
    out_file=None,  # return the DOT source as a string instead of writing a file
    feature_names=predictors,
    class_names=["die", "live"],
    filled=True,
    rounded=True,
    special_characters=True,
)
for tree_no, estimator in enumerate(Estimators):
    # NOTE(review): the "iris_" prefix looks like a leftover from another
    # example; kept so the output file names do not change.
    pdf_path = 'iris_%d.pdf' % tree_no
    dot_source = tree.export_graphviz(estimator, **export_options)
    rendered = pydotplus.graph_from_dot_data(dot_source)
    # The Image object is built and then discarded; nothing here displays it.
    # NOTE(review): if inline rendering in a notebook is wanted, it needs an
    # explicit display call.
    Image(rendered.create_png())
    rendered.write_pdf(pdf_path)

前提:运行上述代码之前需要先安装 graphviz(pydotplus 依赖它的 dot 程序来生成 PNG/PDF),例如在 CentOS/RHEL 上执行:

yum install graphviz

本文用到的训练集(train2.csv)请参考上一篇文章。