weka cfsSubsetEvalue.java 中文注释
2015-11-27 17:34
375 查看
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*本程序为免费软件;可以通过免费软件中心的发布
* 的GNU公共许可的任何版本下下重写或者修改它.
* 开发本程序的目的是是希望它是有用的,但没有任
* 何授权,甚至没有潜在的商用或其他特殊目的的授
* 权.想要了解更多细节,请参阅GNU公共许可.
* 在拿到程序的同时,你应该收到GNU公共许可;假如
* 没有的话,请致函免费软件中心.
*/
*/
/*
* CfsSubsetEval.java
* Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
*
*/
package weka.attributeSelection;
import java.util.BitSet;
import java.util.Enumeration;
import java.util.Vector;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.ContingencyTables;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.supervised.attribute.Discretize;
/**
* <!-- globalinfo-start --> CfsSubsetEval :<br/>
* <br/>
* Evaluates the worth of a subset of attributes by considering the individual
* predictive ability of each feature along with the degree of redundancy
*根据属性子集中每一个特征的预测能力以及它们之间的关联性进行评估
* between them.<br/>
* <br/>
* Subsets of features that are highly correlated with the class while having
* low intercorrelation are preferred.<br/>
* 与类具有高相关的属性子集被推荐选择。
*<br/>
* For more information see:<br/>
* <br/>
* M. A. Hall (1998). Correlation-based Feature Subset Selection for Machine
* Learning. Hamilton, New Zealand.
* <p/>
* <!-- globalinfo-end -->
*
* <!-- technical-bibtex-start --> BibTeX:
*
* <pre>
* @phdthesis{Hall1998,
* address = {Hamilton, New Zealand},
* author = {M. A. Hall},
* school = {University of Waikato},
* title = {Correlation-based Feature Subset Selection for Machine Learning},
* year = {1998}
* }
* </pre>
* <p/>
* <!-- technical-bibtex-end -->
*
* <!-- options-start --> Valid options are:
* <p/>
*
* <pre>
* -M
* Treat missing values as a separate value.
* </pre>
*
* <pre>
* -L
* Don't include locally predictive attributes.
* </pre>
*
* <!-- options-end -->
*
* @author Mark Hall (mhall@cs.waikato.ac.nz)
* @version $Revision: 11851 $
* @see Discretize
*/
public class CfsSubsetEval
extends ASEvaluation
implements SubsetEvaluator,
OptionHandler,
TechnicalInformationHandler {
/** for serialization */
static final long serialVersionUID = 747878400813276317L;
/** The training instances
* 训练实例集
*/
private Instances m_trainInstances;
/** Discretise attributes when class in nominal
* 当属性是名词形式时,保存属性的离散化的值
*/
private Discretize m_disTransform;
/** The class index
* 类标
*/
private int m_classIndex;
/** Is the class numeric
* 存储属性是否是数值型信息
*/
private boolean m_isNumeric;
/** Number of attributes in the training data
* 储存训练集中属性的个数
*/
private int m_numAttribs;
/** Number of instances in the training data
* 储存训练集中实例的个数
*/
private int m_numInstances;
/** Treat missing values as separate values
* 存储是否是缺省值
*/
private boolean m_missingSeparate;
/** Include locally predicitive attributes
*包含局部预测的属性
*/
private boolean m_locallyPredictive;
/** Holds the matrix of attribute correlations */
// private Matrix m_corr_matrix;
// 求相关系数的矩阵
private float[][] m_corr_matrix;
/** Standard deviations of attributes (when using pearsons correlation) */
//储存属性的标准差
private double[] m_std_devs;
/** Threshold for admitting locally predictive features */
//允许局部预测属性的阈值
private double m_c_Threshold;
/**
* Returns a string describing this attribute evaluator
* 返回一个描述该评价器的字符串,以方便在浏览器和图形化界面展示
*
* @return a description of the evaluator suitable for displaying in the
* explorer/experimenter gui
*/
public String globalInfo() {
return "CfsSubsetEval :\n\nEvaluates the worth of a subset of attributes "
+ "by considering the individual predictive ability of each feature "
+ "along with the degree of redundancy between them.\n\n"
+ "Subsets of features that are highly correlated with the class "
+ "while having low intercorrelation are preferred.\n\n"
+ "For more information see:\n\n"
+ getTechnicalInformation().toString();
}
/**
* Returns an instance of a TechnicalInformation object, containing detailed
* information about the technical background of this class, e.g., paper
* reference or book this class is based on.
* 返回TechnicalInformation类的一个实例,该实例包括当前类的详细技术背景,
* 所涉及到的论文,书籍等
* @return the technical information about this class
*/
@Override
public TechnicalInformation getTechnicalInformation() {
TechnicalInformation result;
result = new TechnicalInformation(Type.PHDTHESIS);
result.setValue(Field.AUTHOR, "M. A. Hall");
result.setValue(Field.YEAR, "1998");
result.setValue(Field.TITLE,
"Correlation-based Feature Subset Selection for Machine Learning");
result.setValue(Field.SCHOOL, "University of Waikato");
result.setValue(Field.ADDRESS, "Hamilton, New Zealand");
return result;
}
/**
* Constructor
*/
//构造函数
public CfsSubsetEval() {
resetOptions(); //调用resetOptions方法
}
/**
* Returns an enumeration describing the available options.
* 返回一个枚举类型的值来描述可用的选项
* @return an enumeration of all the available options.
*
**/
@Override
public Enumeration listOptions() {
Vector newVector = new Vector(3);
newVector.addElement(new Option("\tTreat missing values as a separate "
+ "value.", "M", 0, "-M"));
newVector.addElement(new Option(
"\tDon't include locally predictive attributes"
+ ".", "L", 0, "-L"));
return newVector.elements();
}
/**
* Parses and sets a given list of options.
* <p/>
*
* <!-- options-start --> Valid options are:
* <p/>
*
* <pre>
* -M
* Treat missing values as a separate value.
* </pre>
*
* <pre>
* -L
* Don't include locally predictive attributes.
* </pre>
*
* <!-- options-end -->
*
* @param options the list of options as an array of strings
* @throws Exception if an option is not supported
*
**/
//解析并且设定一个选项列表,将缺失的值作为一个分离值,去掉局部的预测属性
@Override
public void setOptions(String[] options)
throws Exception {
resetOptions();
setMissingSeparate(Utils.getFlag('M', options));
setLocallyPredictive(!Utils.getFlag('L', options));
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
// 返回局部预测属性的提示信息
public String locallyPredictiveTipText() {
return "Identify locally predictive attributes. Iteratively adds "
+ "attributes with the highest correlation with the class as long "
+ "as there is not already an attribute in the subset that has a "
+ "higher correlation with the attribute in question";
}
/**
* Include locally predictive attributes
*
* @param b true or false
*/
//设置是否包括局部预测属性
public void setLocallyPredictive(boolean b) {
m_locallyPredictive = b;
}
/**
* Return true if including locally predictive attributes
*
* @return true if locally predictive attributes are to be used
*/
//查询是否包含局部预测属性,并返回true或false
public boolean getLocallyPredictive() {
return m_locallyPredictive;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
//返回缺省值属性提示信息
public String missingSeparateTipText() {
return "Treat missing as a separate value. Otherwise, counts for missing "
+ "values are distributed across other values in proportion to their "
+ "frequency.";
}
/**
* Treat missing as a separate value
*
* @param b true or false
*/
//设置是否为缺省值
public void setMissingSeparate(boolean b) {
m_missingSeparate = b;
}
/**
* Return true is missing is treated as a separate value
*
* @return true if missing is to be treated as a separate value
*/
// 查询是否为缺省值
public boolean getMissingSeparate() {
return m_missingSeparate;
}
/**
* Gets the current settings of CfsSubsetEval
*
* @return an array of strings suitable for passing to setOptions()
*/
//获取CfsSubsetEval的当前设置
@Override
public String[] getOptions() {
String[] options = new String[2];
int current = 0;
if (getMissingSeparate()) {
options[current++] = "-M";
}
if (!getLocallyPredictive()) {
options[current++] = "-L";
}
while (current < options.length) {
options[current++] = "";
}
return options;
}
/**
* Returns the capabilities of this evaluator.
*
* @return the capabilities of this evaluator
* @see Capabilities
*/
//返回评价器的评价性能
@Override
public Capabilities getCapabilities() {
Capabilities result = super.getCapabilities(); //声明一个评价结果类
result.disableAll();
// 设置可以评价的属性的类型
result.enable(Capability.NOMINAL_ATTRIBUTES);
result.enable(Capability.NUMERIC_ATTRIBUTES);
result.enable(Capability.DATE_ATTRIBUTES);
result.enable(Capability.MISSING_VALUES);
// 设置可以评价的类的类型
result.enable(Capability.NOMINAL_CLASS);
result.enable(Capability.NUMERIC_CLASS);
result.enable(Capability.DATE_CLASS);
result.enable(Capability.MISSING_CLASS_VALUES);
return result;
}
/**
* Generates a attribute evaluator. Has to initialize all fields of the
* evaluator that are not being set via options.
*
* CFS also discretises attributes (if necessary) and initializes the
* correlation matrix.
*
* @param data set of instances serving as training data
* @throws Exception if the evaluator has not been generated successfully
*/
//产生一个评价器,必须初始化评价器中所有未设置偏重选项的量
@Override
public void buildEvaluator(Instances data)
throws Exception {
// can evaluator handle data?
getCapabilities().testWithFail(data);
m_trainInstances = new Instances(data); //声明一个实例集
m_trainInstances.deleteWithMissingClass();
m_classIndex = m_trainInstances.classIndex(); //获得类标
m_numAttribs = m_trainInstances.numAttributes(); //获得属性的数量
m_numInstances = m_trainInstances.numInstances(); //获得实例的数量
m_isNumeric = m_trainInstances.attribute(m_classIndex).isNumeric(); //判断属性是不是数值型
if (!m_isNumeric) { //如果属性不是数值型,则进行离散化
m_disTransform = new Discretize();
m_disTransform.setUseBetterEncoding(true);
m_disTransform.setInputFormat(m_trainInstances);
m_trainInstances = Filter.useFilter(m_trainInstances, m_disTransform);
}
m_std_devs = new double[m_numAttribs]; //声明一个有m_numAttribs个元素的存储属性标准差的数组
m_corr_matrix = new float[m_numAttribs][]; //声明一个有m_numAttribs*m_numAttribs个元素存储的属性之间相关性的数组
for (int i = 0; i < m_numAttribs; i++) {
m_corr_matrix[i] = new float[i + 1];
}
for (int i = 0; i < m_corr_matrix.length; i++) { //为前面声明的两个数组赋初值
m_corr_matrix[i][i] = 1.0f; //为关联性矩阵的对角线赋初值
m_std_devs[i] = 1.0;
}
for (int i = 0; i < m_numAttribs; i++) {
for (int j = 0; j < m_corr_matrix[i].length - 1; j++) {
m_corr_matrix[i][j] = -999; //为相关性矩阵的非对角线部分赋初值
}
}
}
/**
* evaluates a subset of attributes
*
* @param subset a bitset representing the attribute subset to be evaluated
* @return the merit
* @throws Exception if the subset could not be evaluated
*/
//对一个子集进行评价
@Override
public double evaluateSubset(BitSet subset)
throws Exception {
double num = 0.0;
double denom = 0.0;
float corr;
int larger, smaller;
// do numerator 求分子运算
for (int i = 0; i < m_numAttribs; i++) {
if (i != m_classIndex) {
if (subset.get(i)) {
if (i > m_classIndex) {
larger = i;
smaller = m_classIndex;
} else {
smaller = i;
larger = m_classIndex;
}
/*
* int larger = (i > m_classIndex ? i : m_classIndex); int smaller =
* (i > m_classIndex ? m_classIndex : i);
*/
if (m_corr_matrix[larger][smaller] == -999) {
corr = correlate(i, m_classIndex);
m_corr_matrix[larger][smaller] = corr;
num += (m_std_devs[i] * corr);
}
else {
num += (m_std_devs[i] * m_corr_matrix[larger][smaller]);
}
}
}
}
// do denominator 求分母运算
for (int i = 0; i < m_numAttribs; i++) {
if (i != m_classIndex) {
if (subset.get(i)) {
denom += (1.0 * m_std_devs[i] * m_std_devs[i]);
for (int j = 0; j < m_corr_matrix[i].length - 1; j++) {
if (subset.get(j)) {
if (m_corr_matrix[i][j] == -999) {
corr = correlate(i, j);
m_corr_matrix[i][j] = corr;
denom += (2.0 * m_std_devs[i] * m_std_devs[j] * corr);
}
else {
denom +=
(2.0 * m_std_devs[i] * m_std_devs[j] * m_corr_matrix[i][j]);
}
}
}
}
}
}
if (denom < 0.0) {
denom *= -1.0;
}
if (denom == 0.0) {
return (0.0);
}
double merit = (num / Math.sqrt(denom)); //计算评价结果
if (merit < 0.0) {
merit *= -1.0;
}
return merit; //返回评价结果
}
// 求两个属性的相关性
private float correlate(int att1, int att2) {
if (!m_isNumeric) {
return (float) symmUncertCorr(att1, att2);
}
// 判断属性是否为数值型的
boolean att1_is_num = (m_trainInstances.attribute(att1).isNumeric());
boolean att2_is_num = (m_trainInstances.attribute(att2).isNumeric());
//如果两个属性都为数值型,则调用num_num函数求解
if (att1_is_num && att2_is_num) {
return (float) num_num(att1, att2);
}
else { //如果两个属性中,第二个属性是数值型,则调用num_nom2方法求解
if (att2_is_num) {
return (float) num_nom2(att1, att2);
}
else { //如果两个属性中,第一个属性是数值型,则调用num_nom1方法求解
if (att1_is_num) {
return (float) num_nom2(att2, att1);
}
}
}
// 如果两个属性都不是数值型,则调用nom_nom函数求解
return (float) nom_nom(att1, att2);
}
// 求属性的不确定性系数
private double symmUncertCorr(int att1, int att2) {
int i, j, k, ii, jj;
int ni, nj;
double sum = 0.0;
double sumi[], sumj[];
double counts[][];
Instance inst;
double corr_measure;
boolean flag = false;
double temp = 0.0;
if (att1 == m_classIndex || att2 == m_classIndex) {
flag = true;
}
ni = m_trainInstances.attribute(att1).numValues() + 1;
nj = m_trainInstances.attribute(att2).numValues() + 1;
counts = new double[ni][nj];
sumi = new double[ni];
sumj = new double[nj];
for (i = 0; i < ni; i++) {
sumi[i] = 0.0;
for (j = 0; j < nj; j++) {
sumj[j] = 0.0;
counts[i][j] = 0.0;
}
}
// Fill the contingency table
// 构造可能性表
for (i = 0; i < m_numInstances; i++) {
inst = m_trainInstances.instance(i);
if (inst.isMissing(att1)) {
ii = ni - 1;
}
else {
ii = (int) inst.value(att1);
}
if (inst.isMissing(att2)) {
jj = nj - 1;
}
else {
jj = (int) inst.value(att2);
}
counts[ii][jj]++;
}
// get the row totals 求的行的总和
for (i = 0; i < ni; i++) {
sumi[i] = 0.0;
for (j = 0; j < nj; j++) {
sumi[i] += counts[i][j];
sum += counts[i][j];
}
}
// get the column totals 求的列的总和
for (j = 0; j < nj; j++) {
sumj[j] = 0.0;
for (i = 0; i < ni; i++) {
sumj[j] += counts[i][j];
}
}
// distribute missing counts // 分离出缺省值的计数
if (!m_missingSeparate &&
(sumi[ni - 1] < m_numInstances) &&
(sumj[nj - 1] < m_numInstances)) {
double[] i_copy = new double[sumi.length];
double[] j_copy = new double[sumj.length];
double[][] counts_copy = new double[sumi.length][sumj.length];
for (i = 0; i < ni; i++) {
System.arraycopy(counts[i], 0, counts_copy[i], 0, sumj.length);
}
System.arraycopy(sumi, 0, i_copy, 0, sumi.length);
System.arraycopy(sumj, 0, j_copy, 0, sumj.length);
double total_missing =
(sumi[ni - 1] + sumj[nj - 1] - counts[ni - 1][nj - 1]);
// do the missing i's 计算缺失的行数
if (sumi[ni - 1] > 0.0) {
for (j = 0; j < nj - 1; j++) {
if (counts[ni - 1][j] > 0.0) {
for (i = 0; i < ni - 1; i++) {
temp = ((i_copy[i] / (sum - i_copy[ni - 1])) * counts[ni - 1][j]);
counts[i][j] += temp;
sumi[i] += temp;
}
counts[ni - 1][j] = 0.0;
}
}
}
sumi[ni - 1] = 0.0;
// do the missing j's //计算缺失的列数
if (sumj[nj - 1] > 0.0) {
for (i = 0; i < ni - 1; i++) {
if (counts[i][nj - 1] > 0.0) {
for (j = 0; j < nj - 1; j++) {
temp = ((j_copy[j] / (sum - j_copy[nj - 1])) * counts[i][nj - 1]);
counts[i][j] += temp;
sumj[j] += temp;
}
counts[i][nj - 1] = 0.0;
}
}
}
sumj[nj - 1] = 0.0;
// do the both missing //计算整体的缺失
if (counts[ni - 1][nj - 1] > 0.0 && total_missing != sum) {
for (i = 0; i < ni - 1; i++) {
for (j = 0; j < nj - 1; j++) {
temp = (counts_copy[i][j] / (sum - total_missing)) *
counts_copy[ni - 1][nj - 1];
counts[i][j] += temp;
sumi[i] += temp;
sumj[j] += temp;
}
}
counts[ni - 1][nj - 1] = 0.0;
}
}
corr_measure = ContingencyTables.symmetricalUncertainty(counts);
if (Utils.eq(corr_measure, 0.0)) {
if (flag == true) {
return (0.0);
}
else {
return (1.0);
}
}
else {
return (corr_measure);
}
}
// 数值型属性相关系数的计算
private double num_num(int att1, int att2) {
int i;
Instance inst;
double r, diff1, diff2, num = 0.0, sx = 0.0, sy = 0.0;
double mx = m_trainInstances.meanOrMode(m_trainInstances.attribute(att1));
double my = m_trainInstances.meanOrMode(m_trainInstances.attribute(att2));
for (i = 0; i < m_numInstances; i++) {
inst = m_trainInstances.instance(i);
diff1 = (inst.isMissing(att1)) ? 0.0 : (inst.value(att1) - mx);
diff2 = (inst.isMissing(att2)) ? 0.0 : (inst.value(att2) - my);
num += (diff1 * diff2);
sx += (diff1 * diff1);
sy += (diff2 * diff2);
}
if (sx != 0.0) {
if (m_std_devs[att1] == 1.0) {
m_std_devs[att1] = Math.sqrt((sx / m_numInstances));
}
}
if (sy != 0.0) {
if (m_std_devs[att2] == 1.0) {
m_std_devs[att2] = Math.sqrt((sy / m_numInstances));
}
}
if ((sx * sy) > 0.0) {
r = (num / (Math.sqrt(sx * sy)));
return ((r < 0.0) ? -r : r);
}
else {
if (att1 != m_classIndex && att2 != m_classIndex) {
return 1.0;
}
else {
return 0.0;
}
}
}
// 数值型和非数值型相关系数的计算
private double num_nom2(int att1, int att2) {
int i, ii, k;
double temp;
Instance inst;
int mx = (int) m_trainInstances.
meanOrMode(m_trainInstances.attribute(att1));
double my = m_trainInstances.
meanOrMode(m_trainInstances.attribute(att2));
double stdv_num = 0.0;
double diff1, diff2;
double r = 0.0, rr;
int nx = (!m_missingSeparate)
? m_trainInstances.attribute(att1).numValues()
: m_trainInstances.attribute(att1).numValues() + 1;
double[] prior_nom = new double[nx];
double[] stdvs_nom = new double[nx];
double[] covs = new double[nx];
for (i = 0; i < nx; i++) {
stdvs_nom[i] = covs[i] = prior_nom[i] = 0.0;
}
// calculate frequencies (and means) of the values of the nominal
// attribute
// 计算名词性属性的概率值
for (i = 0; i < m_numInstances; i++) {
inst = m_trainInstances.instance(i);
if (inst.isMissing(att1)) {
if (!m_missingSeparate) {
ii = mx;
}
else {
ii = nx - 1;
}
}
else {
ii = (int) inst.value(att1);
}
// increment freq for nominal
prior_nom[ii]++;
}
for (k = 0; k < m_numInstances; k++) {
inst = m_trainInstances.instance(k);
// std dev of numeric attribute 数值型属性的标准化
diff2 = (inst.isMissing(att2)) ? 0.0 : (inst.value(att2) - my);
stdv_num += (diff2 * diff2);
//
for (i = 0; i < nx; i++) {
if (inst.isMissing(att1)) {
if (!m_missingSeparate) {
temp = (i == mx) ? 1.0 : 0.0;
}
else {
temp = (i == (nx - 1)) ? 1.0 : 0.0;
}
}
else {
temp = (i == inst.value(att1)) ? 1.0 : 0.0;
}
diff1 = (temp - (prior_nom[i] / m_numInstances));
stdvs_nom[i] += (diff1 * diff1);
covs[i] += (diff1 * diff2);
}
}
// calculate weighted correlation 计算权值的相关性
for (i = 0, temp = 0.0; i < nx; i++) {
// calculate the weighted variance of the nominal 计算权重的方差
temp +=
((prior_nom[i] / m_numInstances) * (stdvs_nom[i] / m_numInstances));
if ((stdvs_nom[i] * stdv_num) > 0.0) {
// System.out.println("Stdv :"+stdvs_nom[i]);
rr = (covs[i] / (Math.sqrt(stdvs_nom[i] * stdv_num)));
if (rr < 0.0) {
rr = -rr;
}
r += ((prior_nom[i] / m_numInstances) * rr);
}
/*
* 如果在范畴属性的特定水平下数值属性的方差为0.然后如果不是这个类,这个
* 最差相关性就是1,如果是这个类,它的最差相关性就是0.
*/
else {
if (att1 != m_classIndex && att2 != m_classIndex) {
r += ((prior_nom[i] / m_numInstances) * 1.0);
}
}
}
// 如果必要的话为这些属性设置标准差
// if ((att1 != classIndex) && (att2 != classIndex)) // =============
if (temp != 0.0) {
if (m_std_devs[att1] == 1.0) {
m_std_devs[att1] = Math.sqrt(temp);
}
}
if (stdv_num != 0.0) {
if (m_std_devs[att2] == 1.0) {
m_std_devs[att2] = Math.sqrt((stdv_num / m_numInstances));
}
}
if (r == 0.0) {
if (att1 != m_classIndex && att2 != m_classIndex) {
r = 1.0;
}
}
return r;
}
// 求非数值型属性的相关系数
private double nom_nom(int att1, int att2) {
int i, j, ii, jj, z;
double temp1, temp2;
Instance inst;
int mx = (int) m_trainInstances.
meanOrMode(m_trainInstances.attribute(att1));
int my = (int) m_trainInstances.
meanOrMode(m_trainInstances.attribute(att2));
double diff1, diff2;
double r = 0.0, rr;
int nx = (!m_missingSeparate)
? m_trainInstances.attribute(att1).numValues()
: m_trainInstances.attribute(att1).numValues() + 1;
int ny = (!m_missingSeparate)
? m_trainInstances.attribute(att2).numValues()
: m_trainInstances.attribute(att2).numValues() + 1;
double[][] prior_nom = new double[nx][ny];
double[] sumx = new double[nx];
double[] sumy = new double[ny];
double[] stdvsx = new double[nx];
double[] stdvsy = new double[ny];
double[][] covs = new double[nx][ny];
for (i = 0; i < nx; i++) {
sumx[i] = stdvsx[i] = 0.0;
}
for (j = 0; j < ny; j++) {
sumy[j] = stdvsy[j] = 0.0;
}
for (i = 0; i < nx; i++) {
for (j = 0; j < ny; j++) {
covs[i][j] = prior_nom[i][j] = 0.0;
}
}
// 计算非数值型属性的频率
for (i = 0; i < m_numInstances; i++) {
inst = m_trainInstances.instance(i);
if (inst.isMissing(att1)) {
if (!m_missingSeparate) {
ii = mx;
}
else {
ii = nx - 1;
}
}
else {
ii = (int) inst.value(att1);
}
if (inst.isMissing(att2)) {
if (!m_missingSeparate) {
jj = my;
}
else {
jj = ny - 1;
}
}
else {
jj = (int) inst.value(att2);
}
prior_nom[ii][jj]++;
sumx[ii]++;
sumy[jj]++;
}
for (z = 0; z < m_numInstances; z++) {
inst = m_trainInstances.instance(z);
for (j = 0; j < ny; j++) {
if (inst.isMissing(att2)) {
if (!m_missingSeparate) {
temp2 = (j == my) ? 1.0 : 0.0;
}
else {
temp2 = (j == (ny - 1)) ? 1.0 : 0.0;
}
}
else {
temp2 = (j == inst.value(att2)) ? 1.0 : 0.0;
}
diff2 = (temp2 - (sumy[j] / m_numInstances));
stdvsy[j] += (diff2 * diff2);
}
for (i = 0; i < nx; i++) {
if (inst.isMissing(att1)) {
if (!m_missingSeparate) {
temp1 = (i == mx) ? 1.0 : 0.0;
}
else {
temp1 = (i == (nx - 1)) ? 1.0 : 0.0;
}
}
else {
temp1 = (i == inst.value(att1)) ? 1.0 : 0.0;
}
diff1 = (temp1 - (sumx[i] / m_numInstances));
stdvsx[i] += (diff1 * diff1);
for (j = 0; j < ny; j++) {
if (inst.isMissing(att2)) {
if (!m_missingSeparate) {
temp2 = (j == my) ? 1.0 : 0.0;
}
else {
temp2 = (j == (ny - 1)) ? 1.0 : 0.0;
}
}
else {
temp2 = (j == inst.value(att2)) ? 1.0 : 0.0;
}
diff2 = (temp2 - (sumy[j] / m_numInstances));
covs[i][j] += (diff1 * diff2);
}
}
}
// 计算权值的相关性
for (i = 0; i < nx; i++) {
for (j = 0; j < ny; j++) {
if ((stdvsx[i] * stdvsy[j]) > 0.0) {
// System.out.println("Stdv :"+stdvs_nom[i]);
rr = (covs[i][j] / (Math.sqrt(stdvsx[i] * stdvsy[j])));
if (rr < 0.0) {
rr = -rr;
}
r += ((prior_nom[i][j] / m_numInstances) * rr);
}
//如果在范畴属性的特定水平下数值属性的方差为0.然后如果不是这个类,这个
//最差相关性就是1,如果是这个类,它的最差相关性就是0.
else {
if (att1 != m_classIndex && att2 != m_classIndex) {
r += ((prior_nom[i][j] / m_numInstances) * 1.0);
}
}
}
}
//如果有必要的话,计算权值的标准差
for (i = 0, temp1 = 0.0; i < nx; i++) {
temp1 += ((sumx[i] / m_numInstances) * (stdvsx[i] / m_numInstances));
}
if (temp1 != 0.0) {
if (m_std_devs[att1] == 1.0) {
m_std_devs[att1] = Math.sqrt(temp1);
}
}
for (j = 0, temp2 = 0.0; j < ny; j++) {
temp2 += ((sumy[j] / m_numInstances) * (stdvsy[j] / m_numInstances));
}
if (temp2 != 0.0) {
if (m_std_devs[att2] == 1.0) {
m_std_devs[att2] = Math.sqrt(temp2);
}
}
if (r == 0.0) {
if (att1 != m_classIndex && att2 != m_classIndex) {
r = 1.0;
}
}
return r;
}
// 返回一个描述CFS的字符串
@Override
public String toString() {
StringBuffer text = new StringBuffer();
if (m_trainInstances == null) {
text.append("CFS subset evaluator has not been built yet\n");
}
else {
text.append("\tCFS Subset Evaluator\n");
if (m_missingSeparate) {
text.append("\tTreating missing values as a separate value\n");
}
if (m_locallyPredictive) {
text.append("\tIncluding locally predictive attributes\n");
}
}
return text.toString();
}
//增加局部预测
private void addLocallyPredictive(BitSet best_group) {
int i, j;
boolean done = false;
boolean ok = true;
double temp_best = -1.0;
float corr;
j = 0;
BitSet temp_group = (BitSet) best_group.clone();
int larger, smaller;
while (!done) {
temp_best = -1.0;
// 找出一个最好的,但是在best_group中不存在的数
for (i = 0; i < m_numAttribs; i++) {
if (i > m_classIndex) {
larger = i;
smaller = m_classIndex;
} else {
smaller = i;
larger = m_classIndex;
}
/*
* int larger = (i > m_classIndex ? i : m_classIndex); int smaller = (i
* > m_classIndex ? m_classIndex : i);
*/
if ((!temp_group.get(i)) && (i != m_classIndex)) {
if (m_corr_matrix[larger][smaller] == -999) {
corr = correlate(i, m_classIndex);
m_corr_matrix[larger][smaller] = corr;
}
if (m_corr_matrix[larger][smaller] > temp_best) {
temp_best = m_corr_matrix[larger][smaller];
j = i;
}
}
}
if (temp_best == -1.0) {
done = true;
}
else {
ok = true;
temp_group.set(j);
// 检查与best_group组中已经存在的最不相关的
for (i = 0; i < m_numAttribs; i++) {
if (i > j) {
larger = i;
smaller = j;
} else {
larger = j;
smaller = i;
}
/*
* int larger = (i > j ? i : j); int smaller = (i > j ? j : i);
*/
if (best_group.get(i)) {
if (m_corr_matrix[larger][smaller] == -999) {
corr = correlate(i, j);
m_corr_matrix[larger][smaller] = corr;
}
if (m_corr_matrix[larger][smaller] > temp_best - m_c_Threshold) {
ok = false;
break;
}
}
}
// 如果ok,则添加到best_group组中
if (ok) {
best_group.set(j);
}
}
}
}
// 为包含局部预测属性而调用locallyPredictive
@Override
public int[] postProcess(int[] attributeSet)
throws Exception {
int j = 0;
if (!m_locallyPredictive) {
return attributeSet;
}
BitSet bestGroup = new BitSet(m_numAttribs);
for (int element : attributeSet) {
bestGroup.set(element);
}
addLocallyPredictive(bestGroup);
// 统计被加入best_group数组中的元素个数
for (int i = 0; i < m_numAttribs; i++) {
if (bestGroup.get(i)) {
j++;
}
}
int[] newSet = new int[j];
j = 0;
for (int i = 0; i < m_numAttribs; i++) {
if (bestGroup.get(i)) {
newSet[j++] = i;
}
}
return newSet;
}
//重置实例集
@Override
public void clean() {
if (m_trainInstances != null) {
// save memory
m_trainInstances = new Instances(m_trainInstances, 0);
}
}
//设置选择项
protected void resetOptions() {
m_trainInstances = null;
m_missingSeparate = false;
m_locallyPredictive = true;
m_c_Threshold = 0.0;
}
/**
* Returns the revision string.
*
* @return the revision
*/
//返回版本信息
@Override
public String getRevision() {
return RevisionUtils.extract("$Revision: 11851 $");
}
/**
* Main method for testing this class.
*
* @param args the options
*/
public static void main(String[] args) {
runEvaluator(new CfsSubsetEval(), args);
}
}
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*本程序为免费软件;可以通过免费软件中心的发布
* 的GNU公共许可的任何版本下下重写或者修改它.
* 开发本程序的目的是是希望它是有用的,但没有任
* 何授权,甚至没有潜在的商用或其他特殊目的的授
* 权.想要了解更多细节,请参阅GNU公共许可.
* 在拿到程序的同时,你应该收到GNU公共许可;假如
* 没有的话,请致函免费软件中心.
*/
*/
/*
* CfsSubsetEval.java
* Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
*
*/
package weka.attributeSelection;
import java.util.BitSet;
import java.util.Enumeration;
import java.util.Vector;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.ContingencyTables;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.supervised.attribute.Discretize;
/**
* <!-- globalinfo-start --> CfsSubsetEval :<br/>
* <br/>
* Evaluates the worth of a subset of attributes by considering the individual
* predictive ability of each feature along with the degree of redundancy
*根据属性子集中每一个特征的预测能力以及它们之间的关联性进行评估
* between them.<br/>
* <br/>
* Subsets of features that are highly correlated with the class while having
* low intercorrelation are preferred.<br/>
* 与类具有高相关的属性子集被推荐选择。
*<br/>
* For more information see:<br/>
* <br/>
* M. A. Hall (1998). Correlation-based Feature Subset Selection for Machine
* Learning. Hamilton, New Zealand.
* <p/>
* <!-- globalinfo-end -->
*
* <!-- technical-bibtex-start --> BibTeX:
*
* <pre>
* @phdthesis{Hall1998,
* address = {Hamilton, New Zealand},
* author = {M. A. Hall},
* school = {University of Waikato},
* title = {Correlation-based Feature Subset Selection for Machine Learning},
* year = {1998}
* }
* </pre>
* <p/>
* <!-- technical-bibtex-end -->
*
* <!-- options-start --> Valid options are:
* <p/>
*
* <pre>
* -M
* Treat missing values as a separate value.
* </pre>
*
* <pre>
* -L
* Don't include locally predictive attributes.
* </pre>
*
* <!-- options-end -->
*
* @author Mark Hall (mhall@cs.waikato.ac.nz)
* @version $Revision: 11851 $
* @see Discretize
*/
public class CfsSubsetEval
extends ASEvaluation
implements SubsetEvaluator,
OptionHandler,
TechnicalInformationHandler {
/** for serialization */
static final long serialVersionUID = 747878400813276317L;
/** The training instances
* 训练实例集
*/
private Instances m_trainInstances;
/** Discretise attributes when class in nominal
* 当属性是名词形式时,保存属性的离散化的值
*/
private Discretize m_disTransform;
/** The class index
* 类标
*/
private int m_classIndex;
/** Is the class numeric
* 存储属性是否是数值型信息
*/
private boolean m_isNumeric;
/** Number of attributes in the training data
* 储存训练集中属性的个数
*/
private int m_numAttribs;
/** Number of instances in the training data
* 储存训练集中实例的个数
*/
private int m_numInstances;
/** Treat missing values as separate values
* 存储是否是缺省值
*/
private boolean m_missingSeparate;
/** Include locally predicitive attributes
*包含局部预测的属性
*/
private boolean m_locallyPredictive;
/** Holds the matrix of attribute correlations */
// private Matrix m_corr_matrix;
// 求相关系数的矩阵
private float[][] m_corr_matrix;
/** Standard deviations of attributes (when using pearsons correlation) */
//储存属性的标准差
private double[] m_std_devs;
/** Threshold for admitting locally predictive features */
//允许局部预测属性的阈值
private double m_c_Threshold;
/**
* Returns a string describing this attribute evaluator
* 返回一个描述该评价器的字符串,以方便在浏览器和图形化界面展示
*
* @return a description of the evaluator suitable for displaying in the
* explorer/experimenter gui
*/
public String globalInfo() {
return "CfsSubsetEval :\n\nEvaluates the worth of a subset of attributes "
+ "by considering the individual predictive ability of each feature "
+ "along with the degree of redundancy between them.\n\n"
+ "Subsets of features that are highly correlated with the class "
+ "while having low intercorrelation are preferred.\n\n"
+ "For more information see:\n\n"
+ getTechnicalInformation().toString();
}
/**
* Returns an instance of a TechnicalInformation object, containing detailed
* information about the technical background of this class, e.g., paper
* reference or book this class is based on.
* 返回TechnicalInformation类的一个实例,该实例包括当前类的详细技术背景,
* 所涉及到的论文,书籍等
* @return the technical information about this class
*/
@Override
public TechnicalInformation getTechnicalInformation() {
TechnicalInformation result;
result = new TechnicalInformation(Type.PHDTHESIS);
result.setValue(Field.AUTHOR, "M. A. Hall");
result.setValue(Field.YEAR, "1998");
result.setValue(Field.TITLE,
"Correlation-based Feature Subset Selection for Machine Learning");
result.setValue(Field.SCHOOL, "University of Waikato");
result.setValue(Field.ADDRESS, "Hamilton, New Zealand");
return result;
}
/**
* Constructor
*/
//构造函数
public CfsSubsetEval() {
resetOptions(); //调用resetOptions方法
}
/**
* Returns an enumeration describing the available options.
* 返回一个枚举类型的值来描述可用的选项
* @return an enumeration of all the available options.
*
**/
@Override
public Enumeration listOptions() {
Vector newVector = new Vector(3);
newVector.addElement(new Option("\tTreat missing values as a separate "
+ "value.", "M", 0, "-M"));
newVector.addElement(new Option(
"\tDon't include locally predictive attributes"
+ ".", "L", 0, "-L"));
return newVector.elements();
}
/**
* Parses and sets a given list of options.
* <p/>
*
* <!-- options-start --> Valid options are:
* <p/>
*
* <pre>
* -M
* Treat missing values as a separate value.
* </pre>
*
* <pre>
* -L
* Don't include locally predictive attributes.
* </pre>
*
* <!-- options-end -->
*
* @param options the list of options as an array of strings
* @throws Exception if an option is not supported
*
**/
//解析并且设定一个选项列表,将缺失的值作为一个分离值,去掉局部的预测属性
@Override
public void setOptions(String[] options)
throws Exception {
resetOptions();
setMissingSeparate(Utils.getFlag('M', options));
setLocallyPredictive(!Utils.getFlag('L', options));
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
// 返回局部预测属性的提示信息
public String locallyPredictiveTipText() {
return "Identify locally predictive attributes. Iteratively adds "
+ "attributes with the highest correlation with the class as long "
+ "as there is not already an attribute in the subset that has a "
+ "higher correlation with the attribute in question";
}
/**
* Include locally predictive attributes
*
* @param b true or false
*/
//设置是否包括局部预测属性
public void setLocallyPredictive(boolean b) {
m_locallyPredictive = b;
}
/**
* Return true if including locally predictive attributes
*
* @return true if locally predictive attributes are to be used
*/
//查询是否包含局部预测属性,并返回true或false
public boolean getLocallyPredictive() {
return m_locallyPredictive;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
//返回缺省值属性提示信息
public String missingSeparateTipText() {
return "Treat missing as a separate value. Otherwise, counts for missing "
+ "values are distributed across other values in proportion to their "
+ "frequency.";
}
/**
* Treat missing as a separate value
*
* @param b true or false
*/
//设置是否为缺省值
public void setMissingSeparate(boolean b) {
m_missingSeparate = b;
}
/**
* Return true is missing is treated as a separate value
*
* @return true if missing is to be treated as a separate value
*/
// 查询是否为缺省值
public boolean getMissingSeparate() {
return m_missingSeparate;
}
/**
* Gets the current settings of CfsSubsetEval
*
* @return an array of strings suitable for passing to setOptions()
*/
//获取CfsSubsetEval的当前设置
@Override
public String[] getOptions() {
String[] options = new String[2];
int current = 0;
if (getMissingSeparate()) {
options[current++] = "-M";
}
if (!getLocallyPredictive()) {
options[current++] = "-L";
}
while (current < options.length) {
options[current++] = "";
}
return options;
}
/**
* Returns the capabilities of this evaluator.
*
* @return the capabilities of this evaluator
* @see Capabilities
*/
//返回评价器的评价性能
@Override
public Capabilities getCapabilities() {
Capabilities result = super.getCapabilities(); //声明一个评价结果类
result.disableAll();
// 设置可以评价的属性的类型
result.enable(Capability.NOMINAL_ATTRIBUTES);
result.enable(Capability.NUMERIC_ATTRIBUTES);
result.enable(Capability.DATE_ATTRIBUTES);
result.enable(Capability.MISSING_VALUES);
// 设置可以评价的类的类型
result.enable(Capability.NOMINAL_CLASS);
result.enable(Capability.NUMERIC_CLASS);
result.enable(Capability.DATE_CLASS);
result.enable(Capability.MISSING_CLASS_VALUES);
return result;
}
/**
* Generates a attribute evaluator. Has to initialize all fields of the
* evaluator that are not being set via options.
*
* CFS also discretises attributes (if necessary) and initializes the
* correlation matrix.
*
* @param data set of instances serving as training data
* @throws Exception if the evaluator has not been generated successfully
*/
//产生一个评价器,必须初始化评价器中所有未设置偏重选项的量
@Override
public void buildEvaluator(Instances data)
throws Exception {
// can evaluator handle data?
getCapabilities().testWithFail(data);
m_trainInstances = new Instances(data); //声明一个实例集
m_trainInstances.deleteWithMissingClass();
m_classIndex = m_trainInstances.classIndex(); //获得类标
m_numAttribs = m_trainInstances.numAttributes(); //获得属性的数量
m_numInstances = m_trainInstances.numInstances(); //获得实例的数量
m_isNumeric = m_trainInstances.attribute(m_classIndex).isNumeric(); //判断属性是不是数值型
if (!m_isNumeric) { //如果属性不是数值型,则进行离散化
m_disTransform = new Discretize();
m_disTransform.setUseBetterEncoding(true);
m_disTransform.setInputFormat(m_trainInstances);
m_trainInstances = Filter.useFilter(m_trainInstances, m_disTransform);
}
m_std_devs = new double[m_numAttribs]; //声明一个有m_numAttribs个元素的存储属性标准差的数组
m_corr_matrix = new float[m_numAttribs][]; //声明一个有m_numAttribs*m_numAttribs个元素存储的属性之间相关性的数组
for (int i = 0; i < m_numAttribs; i++) {
m_corr_matrix[i] = new float[i + 1];
}
for (int i = 0; i < m_corr_matrix.length; i++) { //为前面声明的两个数组赋初值
m_corr_matrix[i][i] = 1.0f; //为关联性矩阵的对角线赋初值
m_std_devs[i] = 1.0;
}
for (int i = 0; i < m_numAttribs; i++) {
for (int j = 0; j < m_corr_matrix[i].length - 1; j++) {
m_corr_matrix[i][j] = -999; //为相关性矩阵的非对角线部分赋初值
}
}
}
/**
* evaluates a subset of attributes
*
* @param subset a bitset representing the attribute subset to be evaluated
* @return the merit
* @throws Exception if the subset could not be evaluated
*/
//对一个子集进行评价
@Override
public double evaluateSubset(BitSet subset)
throws Exception {
double num = 0.0;
double denom = 0.0;
float corr;
int larger, smaller;
// do numerator 求分子运算
for (int i = 0; i < m_numAttribs; i++) {
if (i != m_classIndex) {
if (subset.get(i)) {
if (i > m_classIndex) {
larger = i;
smaller = m_classIndex;
} else {
smaller = i;
larger = m_classIndex;
}
/*
* int larger = (i > m_classIndex ? i : m_classIndex); int smaller =
* (i > m_classIndex ? m_classIndex : i);
*/
if (m_corr_matrix[larger][smaller] == -999) {
corr = correlate(i, m_classIndex);
m_corr_matrix[larger][smaller] = corr;
num += (m_std_devs[i] * corr);
}
else {
num += (m_std_devs[i] * m_corr_matrix[larger][smaller]);
}
}
}
}
// do denominator 求分母运算
for (int i = 0; i < m_numAttribs; i++) {
if (i != m_classIndex) {
if (subset.get(i)) {
denom += (1.0 * m_std_devs[i] * m_std_devs[i]);
for (int j = 0; j < m_corr_matrix[i].length - 1; j++) {
if (subset.get(j)) {
if (m_corr_matrix[i][j] == -999) {
corr = correlate(i, j);
m_corr_matrix[i][j] = corr;
denom += (2.0 * m_std_devs[i] * m_std_devs[j] * corr);
}
else {
denom +=
(2.0 * m_std_devs[i] * m_std_devs[j] * m_corr_matrix[i][j]);
}
}
}
}
}
}
if (denom < 0.0) {
denom *= -1.0;
}
if (denom == 0.0) {
return (0.0);
}
double merit = (num / Math.sqrt(denom)); //计算评价结果
if (merit < 0.0) {
merit *= -1.0;
}
return merit; //返回评价结果
}
// 求两个属性的相关性
private float correlate(int att1, int att2) {
if (!m_isNumeric) {
return (float) symmUncertCorr(att1, att2);
}
// 判断属性是否为数值型的
boolean att1_is_num = (m_trainInstances.attribute(att1).isNumeric());
boolean att2_is_num = (m_trainInstances.attribute(att2).isNumeric());
//如果两个属性都为数值型,则调用num_num函数求解
if (att1_is_num && att2_is_num) {
return (float) num_num(att1, att2);
}
else { //如果两个属性中,第二个属性是数值型,则调用num_nom2方法求解
if (att2_is_num) {
return (float) num_nom2(att1, att2);
}
else { //如果两个属性中,第一个属性是数值型,则调用num_nom1方法求解
if (att1_is_num) {
return (float) num_nom2(att2, att1);
}
}
}
// 如果两个属性都不是数值型,则调用nom_nom函数求解
return (float) nom_nom(att1, att2);
}
// 求属性的不确定性系数
private double symmUncertCorr(int att1, int att2) {
int i, j, k, ii, jj;
int ni, nj;
double sum = 0.0;
double sumi[], sumj[];
double counts[][];
Instance inst;
double corr_measure;
boolean flag = false;
double temp = 0.0;
if (att1 == m_classIndex || att2 == m_classIndex) {
flag = true;
}
ni = m_trainInstances.attribute(att1).numValues() + 1;
nj = m_trainInstances.attribute(att2).numValues() + 1;
counts = new double[ni][nj];
sumi = new double[ni];
sumj = new double[nj];
for (i = 0; i < ni; i++) {
sumi[i] = 0.0;
for (j = 0; j < nj; j++) {
sumj[j] = 0.0;
counts[i][j] = 0.0;
}
}
// Fill the contingency table
// 构造可能性表
for (i = 0; i < m_numInstances; i++) {
inst = m_trainInstances.instance(i);
if (inst.isMissing(att1)) {
ii = ni - 1;
}
else {
ii = (int) inst.value(att1);
}
if (inst.isMissing(att2)) {
jj = nj - 1;
}
else {
jj = (int) inst.value(att2);
}
counts[ii][jj]++;
}
// get the row totals 求的行的总和
for (i = 0; i < ni; i++) {
sumi[i] = 0.0;
for (j = 0; j < nj; j++) {
sumi[i] += counts[i][j];
sum += counts[i][j];
}
}
// get the column totals 求的列的总和
for (j = 0; j < nj; j++) {
sumj[j] = 0.0;
for (i = 0; i < ni; i++) {
sumj[j] += counts[i][j];
}
}
// distribute missing counts // 分离出缺省值的计数
if (!m_missingSeparate &&
(sumi[ni - 1] < m_numInstances) &&
(sumj[nj - 1] < m_numInstances)) {
double[] i_copy = new double[sumi.length];
double[] j_copy = new double[sumj.length];
double[][] counts_copy = new double[sumi.length][sumj.length];
for (i = 0; i < ni; i++) {
System.arraycopy(counts[i], 0, counts_copy[i], 0, sumj.length);
}
System.arraycopy(sumi, 0, i_copy, 0, sumi.length);
System.arraycopy(sumj, 0, j_copy, 0, sumj.length);
double total_missing =
(sumi[ni - 1] + sumj[nj - 1] - counts[ni - 1][nj - 1]);
// do the missing i's 计算缺失的行数
if (sumi[ni - 1] > 0.0) {
for (j = 0; j < nj - 1; j++) {
if (counts[ni - 1][j] > 0.0) {
for (i = 0; i < ni - 1; i++) {
temp = ((i_copy[i] / (sum - i_copy[ni - 1])) * counts[ni - 1][j]);
counts[i][j] += temp;
sumi[i] += temp;
}
counts[ni - 1][j] = 0.0;
}
}
}
sumi[ni - 1] = 0.0;
// do the missing j's //计算缺失的列数
if (sumj[nj - 1] > 0.0) {
for (i = 0; i < ni - 1; i++) {
if (counts[i][nj - 1] > 0.0) {
for (j = 0; j < nj - 1; j++) {
temp = ((j_copy[j] / (sum - j_copy[nj - 1])) * counts[i][nj - 1]);
counts[i][j] += temp;
sumj[j] += temp;
}
counts[i][nj - 1] = 0.0;
}
}
}
sumj[nj - 1] = 0.0;
// do the both missing //计算整体的缺失
if (counts[ni - 1][nj - 1] > 0.0 && total_missing != sum) {
for (i = 0; i < ni - 1; i++) {
for (j = 0; j < nj - 1; j++) {
temp = (counts_copy[i][j] / (sum - total_missing)) *
counts_copy[ni - 1][nj - 1];
counts[i][j] += temp;
sumi[i] += temp;
sumj[j] += temp;
}
}
counts[ni - 1][nj - 1] = 0.0;
}
}
corr_measure = ContingencyTables.symmetricalUncertainty(counts);
if (Utils.eq(corr_measure, 0.0)) {
if (flag == true) {
return (0.0);
}
else {
return (1.0);
}
}
else {
return (corr_measure);
}
}
// 数值型属性相关系数的计算
private double num_num(int att1, int att2) {
int i;
Instance inst;
double r, diff1, diff2, num = 0.0, sx = 0.0, sy = 0.0;
double mx = m_trainInstances.meanOrMode(m_trainInstances.attribute(att1));
double my = m_trainInstances.meanOrMode(m_trainInstances.attribute(att2));
for (i = 0; i < m_numInstances; i++) {
inst = m_trainInstances.instance(i);
diff1 = (inst.isMissing(att1)) ? 0.0 : (inst.value(att1) - mx);
diff2 = (inst.isMissing(att2)) ? 0.0 : (inst.value(att2) - my);
num += (diff1 * diff2);
sx += (diff1 * diff1);
sy += (diff2 * diff2);
}
if (sx != 0.0) {
if (m_std_devs[att1] == 1.0) {
m_std_devs[att1] = Math.sqrt((sx / m_numInstances));
}
}
if (sy != 0.0) {
if (m_std_devs[att2] == 1.0) {
m_std_devs[att2] = Math.sqrt((sy / m_numInstances));
}
}
if ((sx * sy) > 0.0) {
r = (num / (Math.sqrt(sx * sy)));
return ((r < 0.0) ? -r : r);
}
else {
if (att1 != m_classIndex && att2 != m_classIndex) {
return 1.0;
}
else {
return 0.0;
}
}
}
// 数值型和非数值型相关系数的计算
private double num_nom2(int att1, int att2) {
int i, ii, k;
double temp;
Instance inst;
int mx = (int) m_trainInstances.
meanOrMode(m_trainInstances.attribute(att1));
double my = m_trainInstances.
meanOrMode(m_trainInstances.attribute(att2));
double stdv_num = 0.0;
double diff1, diff2;
double r = 0.0, rr;
int nx = (!m_missingSeparate)
? m_trainInstances.attribute(att1).numValues()
: m_trainInstances.attribute(att1).numValues() + 1;
double[] prior_nom = new double[nx];
double[] stdvs_nom = new double[nx];
double[] covs = new double[nx];
for (i = 0; i < nx; i++) {
stdvs_nom[i] = covs[i] = prior_nom[i] = 0.0;
}
// calculate frequencies (and means) of the values of the nominal
// attribute
// 计算名词性属性的概率值
for (i = 0; i < m_numInstances; i++) {
inst = m_trainInstances.instance(i);
if (inst.isMissing(att1)) {
if (!m_missingSeparate) {
ii = mx;
}
else {
ii = nx - 1;
}
}
else {
ii = (int) inst.value(att1);
}
// increment freq for nominal
prior_nom[ii]++;
}
for (k = 0; k < m_numInstances; k++) {
inst = m_trainInstances.instance(k);
// std dev of numeric attribute 数值型属性的标准化
diff2 = (inst.isMissing(att2)) ? 0.0 : (inst.value(att2) - my);
stdv_num += (diff2 * diff2);
//
for (i = 0; i < nx; i++) {
if (inst.isMissing(att1)) {
if (!m_missingSeparate) {
temp = (i == mx) ? 1.0 : 0.0;
}
else {
temp = (i == (nx - 1)) ? 1.0 : 0.0;
}
}
else {
temp = (i == inst.value(att1)) ? 1.0 : 0.0;
}
diff1 = (temp - (prior_nom[i] / m_numInstances));
stdvs_nom[i] += (diff1 * diff1);
covs[i] += (diff1 * diff2);
}
}
// calculate weighted correlation 计算权值的相关性
for (i = 0, temp = 0.0; i < nx; i++) {
// calculate the weighted variance of the nominal 计算权重的方差
temp +=
((prior_nom[i] / m_numInstances) * (stdvs_nom[i] / m_numInstances));
if ((stdvs_nom[i] * stdv_num) > 0.0) {
// System.out.println("Stdv :"+stdvs_nom[i]);
rr = (covs[i] / (Math.sqrt(stdvs_nom[i] * stdv_num)));
if (rr < 0.0) {
rr = -rr;
}
r += ((prior_nom[i] / m_numInstances) * rr);
}
/*
* 如果在范畴属性的特定水平下数值属性的方差为0.然后如果不是这个类,这个
* 最差相关性就是1,如果是这个类,它的最差相关性就是0.
*/
else {
if (att1 != m_classIndex && att2 != m_classIndex) {
r += ((prior_nom[i] / m_numInstances) * 1.0);
}
}
}
// 如果必要的话为这些属性设置标准差
// if ((att1 != classIndex) && (att2 != classIndex)) // =============
if (temp != 0.0) {
if (m_std_devs[att1] == 1.0) {
m_std_devs[att1] = Math.sqrt(temp);
}
}
if (stdv_num != 0.0) {
if (m_std_devs[att2] == 1.0) {
m_std_devs[att2] = Math.sqrt((stdv_num / m_numInstances));
}
}
if (r == 0.0) {
if (att1 != m_classIndex && att2 != m_classIndex) {
r = 1.0;
}
}
return r;
}
// 求非数值型属性的相关系数
private double nom_nom(int att1, int att2) {
int i, j, ii, jj, z;
double temp1, temp2;
Instance inst;
int mx = (int) m_trainInstances.
meanOrMode(m_trainInstances.attribute(att1));
int my = (int) m_trainInstances.
meanOrMode(m_trainInstances.attribute(att2));
double diff1, diff2;
double r = 0.0, rr;
int nx = (!m_missingSeparate)
? m_trainInstances.attribute(att1).numValues()
: m_trainInstances.attribute(att1).numValues() + 1;
int ny = (!m_missingSeparate)
? m_trainInstances.attribute(att2).numValues()
: m_trainInstances.attribute(att2).numValues() + 1;
double[][] prior_nom = new double[nx][ny];
double[] sumx = new double[nx];
double[] sumy = new double[ny];
double[] stdvsx = new double[nx];
double[] stdvsy = new double[ny];
double[][] covs = new double[nx][ny];
for (i = 0; i < nx; i++) {
sumx[i] = stdvsx[i] = 0.0;
}
for (j = 0; j < ny; j++) {
sumy[j] = stdvsy[j] = 0.0;
}
for (i = 0; i < nx; i++) {
for (j = 0; j < ny; j++) {
covs[i][j] = prior_nom[i][j] = 0.0;
}
}
// 计算非数值型属性的频率
for (i = 0; i < m_numInstances; i++) {
inst = m_trainInstances.instance(i);
if (inst.isMissing(att1)) {
if (!m_missingSeparate) {
ii = mx;
}
else {
ii = nx - 1;
}
}
else {
ii = (int) inst.value(att1);
}
if (inst.isMissing(att2)) {
if (!m_missingSeparate) {
jj = my;
}
else {
jj = ny - 1;
}
}
else {
jj = (int) inst.value(att2);
}
prior_nom[ii][jj]++;
sumx[ii]++;
sumy[jj]++;
}
for (z = 0; z < m_numInstances; z++) {
inst = m_trainInstances.instance(z);
for (j = 0; j < ny; j++) {
if (inst.isMissing(att2)) {
if (!m_missingSeparate) {
temp2 = (j == my) ? 1.0 : 0.0;
}
else {
temp2 = (j == (ny - 1)) ? 1.0 : 0.0;
}
}
else {
temp2 = (j == inst.value(att2)) ? 1.0 : 0.0;
}
diff2 = (temp2 - (sumy[j] / m_numInstances));
stdvsy[j] += (diff2 * diff2);
}
for (i = 0; i < nx; i++) {
if (inst.isMissing(att1)) {
if (!m_missingSeparate) {
temp1 = (i == mx) ? 1.0 : 0.0;
}
else {
temp1 = (i == (nx - 1)) ? 1.0 : 0.0;
}
}
else {
temp1 = (i == inst.value(att1)) ? 1.0 : 0.0;
}
diff1 = (temp1 - (sumx[i] / m_numInstances));
stdvsx[i] += (diff1 * diff1);
for (j = 0; j < ny; j++) {
if (inst.isMissing(att2)) {
if (!m_missingSeparate) {
temp2 = (j == my) ? 1.0 : 0.0;
}
else {
temp2 = (j == (ny - 1)) ? 1.0 : 0.0;
}
}
else {
temp2 = (j == inst.value(att2)) ? 1.0 : 0.0;
}
diff2 = (temp2 - (sumy[j] / m_numInstances));
covs[i][j] += (diff1 * diff2);
}
}
}
// 计算权值的相关性
for (i = 0; i < nx; i++) {
for (j = 0; j < ny; j++) {
if ((stdvsx[i] * stdvsy[j]) > 0.0) {
// System.out.println("Stdv :"+stdvs_nom[i]);
rr = (covs[i][j] / (Math.sqrt(stdvsx[i] * stdvsy[j])));
if (rr < 0.0) {
rr = -rr;
}
r += ((prior_nom[i][j] / m_numInstances) * rr);
}
//如果在范畴属性的特定水平下数值属性的方差为0.然后如果不是这个类,这个
//最差相关性就是1,如果是这个类,它的最差相关性就是0.
else {
if (att1 != m_classIndex && att2 != m_classIndex) {
r += ((prior_nom[i][j] / m_numInstances) * 1.0);
}
}
}
}
//如果有必要的话,计算权值的标准差
for (i = 0, temp1 = 0.0; i < nx; i++) {
temp1 += ((sumx[i] / m_numInstances) * (stdvsx[i] / m_numInstances));
}
if (temp1 != 0.0) {
if (m_std_devs[att1] == 1.0) {
m_std_devs[att1] = Math.sqrt(temp1);
}
}
for (j = 0, temp2 = 0.0; j < ny; j++) {
temp2 += ((sumy[j] / m_numInstances) * (stdvsy[j] / m_numInstances));
}
if (temp2 != 0.0) {
if (m_std_devs[att2] == 1.0) {
m_std_devs[att2] = Math.sqrt(temp2);
}
}
if (r == 0.0) {
if (att1 != m_classIndex && att2 != m_classIndex) {
r = 1.0;
}
}
return r;
}
// 返回一个描述CFS的字符串
@Override
public String toString() {
StringBuffer text = new StringBuffer();
if (m_trainInstances == null) {
text.append("CFS subset evaluator has not been built yet\n");
}
else {
text.append("\tCFS Subset Evaluator\n");
if (m_missingSeparate) {
text.append("\tTreating missing values as a separate value\n");
}
if (m_locallyPredictive) {
text.append("\tIncluding locally predictive attributes\n");
}
}
return text.toString();
}
//增加局部预测
private void addLocallyPredictive(BitSet best_group) {
int i, j;
boolean done = false;
boolean ok = true;
double temp_best = -1.0;
float corr;
j = 0;
BitSet temp_group = (BitSet) best_group.clone();
int larger, smaller;
while (!done) {
temp_best = -1.0;
// 找出一个最好的,但是在best_group中不存在的数
for (i = 0; i < m_numAttribs; i++) {
if (i > m_classIndex) {
larger = i;
smaller = m_classIndex;
} else {
smaller = i;
larger = m_classIndex;
}
/*
* int larger = (i > m_classIndex ? i : m_classIndex); int smaller = (i
* > m_classIndex ? m_classIndex : i);
*/
if ((!temp_group.get(i)) && (i != m_classIndex)) {
if (m_corr_matrix[larger][smaller] == -999) {
corr = correlate(i, m_classIndex);
m_corr_matrix[larger][smaller] = corr;
}
if (m_corr_matrix[larger][smaller] > temp_best) {
temp_best = m_corr_matrix[larger][smaller];
j = i;
}
}
}
if (temp_best == -1.0) {
done = true;
}
else {
ok = true;
temp_group.set(j);
// 检查与best_group组中已经存在的最不相关的
for (i = 0; i < m_numAttribs; i++) {
if (i > j) {
larger = i;
smaller = j;
} else {
larger = j;
smaller = i;
}
/*
* int larger = (i > j ? i : j); int smaller = (i > j ? j : i);
*/
if (best_group.get(i)) {
if (m_corr_matrix[larger][smaller] == -999) {
corr = correlate(i, j);
m_corr_matrix[larger][smaller] = corr;
}
if (m_corr_matrix[larger][smaller] > temp_best - m_c_Threshold) {
ok = false;
break;
}
}
}
// 如果ok,则添加到best_group组中
if (ok) {
best_group.set(j);
}
}
}
}
// 为包含局部预测属性而调用locallyPredictive
@Override
public int[] postProcess(int[] attributeSet)
throws Exception {
int j = 0;
if (!m_locallyPredictive) {
return attributeSet;
}
BitSet bestGroup = new BitSet(m_numAttribs);
for (int element : attributeSet) {
bestGroup.set(element);
}
addLocallyPredictive(bestGroup);
// 统计被加入best_group数组中的元素个数
for (int i = 0; i < m_numAttribs; i++) {
if (bestGroup.get(i)) {
j++;
}
}
int[] newSet = new int[j];
j = 0;
for (int i = 0; i < m_numAttribs; i++) {
if (bestGroup.get(i)) {
newSet[j++] = i;
}
}
return newSet;
}
//重置实例集
@Override
public void clean() {
if (m_trainInstances != null) {
// save memory
m_trainInstances = new Instances(m_trainInstances, 0);
}
}
//设置选择项
protected void resetOptions() {
m_trainInstances = null;
m_missingSeparate = false;
m_locallyPredictive = true;
m_c_Threshold = 0.0;
}
/**
* Returns the revision string.
*
* @return the revision
*/
//返回版本信息
@Override
public String getRevision() {
return RevisionUtils.extract("$Revision: 11851 $");
}
/**
* Main method for testing this class.
*
* @param args the options
*/
public static void main(String[] args) {
runEvaluator(new CfsSubsetEval(), args);
}
}
相关文章推荐
- 关于UITableView 中两个困惑的问题
- 利用UITextField自定义搜索栏,实现中文输入过程中字母的搜索功能
- Selenium webdriver 学习笔记(三) - 基本UI控件操作
- leetcode之路051 N-Queens
- 23设计模式之建造者模式(Builder)
- Handler, Loop, MessageQueue的工作原理
- 为嵌套在左右滑动的UIScrollview上的UITableView增加滑动删除
- 颜色模型
- Machine Learning week 9 quiz: Anomaly Detection
- HFTP Guide
- HDFS Permissions Guide
- HDFS配额指南(HDFS Quotas Guide)
- Offline Image Viewer Guide
- Offline Edits Viewer Guide
- ViewFs Guide
- 自定义UICollectionViewController只读UICollectionViewLayout
- Android uiautomator(1)-创建工程例子
- EasyUI - 自定义页面等待插件
- NGUI 的grid提高显示性能
- easyUI框架之学习3--表格datagrid