#### THIS METHOD IS KNOWN AS SULO METHOD in HONOR OF my mother SULOCHANA SESHADRI ####### This highly efficient method removes variables that are highly correlated using a series of pair-wise correlation knockout rounds. It is extremely fast and hence can work on thousands of variables in less than a minute, even on a laptop.
(df, numvars, modeltype, target,
corr_limit=0.70, verbose=0)
| 1857 | |
| 1858 | ################################################################################## |
| 1859 | def remove_variables_using_fast_correlation(df, numvars, modeltype, target, |
| 1860 | corr_limit=0.70, verbose=0): |
| 1861 | """ |
| 1862 | #### THIS METHOD IS KNOWN AS SULO METHOD in HONOR OF my mother SULOCHANA SESHADRI ####### |
| 1863 | This highly efficient method removes variables that are highly correlated using a series of |
| 1864 | pair-wise correlation knockout rounds. It is extremely fast and hence can work on thousands |
| 1865 | of variables in less than a minute, even on a laptop. You need to send in a list of numeric |
| 1866 | variables and that's all! The method defines high Correlation as anything over 0.70 (absolute) |
| 1867 | but this can be changed. If two variables have absolute correlation higher than this, they |
| 1868 | will be marked, and using a process of elimination, one of them will get knocked out: |
| 1869 | To decide order of variables to keep, we use mutual information score to select. MIS returns |
| 1870 | a ranked list of these correlated variables: when we select one, we knock out other variables |
| 1871 | it is correlated to. Then we select next var. This way we remove correlated variables. |
| 1872 | Finally, we are left with uncorrelated variables that are also highly important (mutual score). |
| 1873 | ############## YOU MUST INCLUDE THE ABOVE MESSAGE IF YOU COPY THIS CODE IN YOUR LIBRARY ##### |
| 1874 | """ |
| 1875 | print(' Removing correlated variables from %d numerics using SULO method' % len(numvars)) |
| 1876 | correlation_dataframe = df[numvars].corr() |
| 1877 | a = correlation_dataframe.values |
| 1878 | col_index = correlation_dataframe.columns.tolist() |
| 1879 | index_triupper = list(zip(np.triu_indices_from(a, k=1)[0], np.triu_indices_from(a, k=1)[1])) |
| 1880 | high_corr_index_list = [x for x in np.argwhere(abs(a[np.triu_indices(len(a), k=1)]) >= corr_limit)] |
| 1881 | tuple_list = [y for y in [index_triupper[x[0]] for x in high_corr_index_list]] |
| 1882 | correlated_pair = [(col_index[local_tuple[0]], col_index[local_tuple[1]]) for local_tuple in tuple_list] |
| 1883 | corr_pair_dict = dict(return_dictionary_list(correlated_pair)) |
| 1884 | keys_in_dict = list(corr_pair_dict.keys()) |
| 1885 | reverse_correlated_pair = [(y, x) for (x, y) in correlated_pair] |
| 1886 | reverse_corr_pair_dict = dict(return_dictionary_list(reverse_correlated_pair)) |
| 1887 | for key, val in reverse_corr_pair_dict.items(): |
| 1888 | if key in keys_in_dict: |
| 1889 | if len(key) > 1: |
| 1890 | corr_pair_dict[key] += val |
| 1891 | else: |
| 1892 | corr_pair_dict[key] = val |
| 1893 | flat_corr_pair_list = [item for sublist in correlated_pair for item in sublist] |
| 1894 | #### You can make it a dictionary or a tuple of lists. We have chosen the latter here to keep order intact. |
| 1895 | corr_pair_count_dict = count_freq_in_list(flat_corr_pair_list) |
| 1896 | corr_list = [k for (k, v) in corr_pair_count_dict] |
| 1897 | ###### This is for ordering the variables from highest to lowest importance by target ### |
| 1898 | if len(corr_list) == 0: |
| 1899 | print('Selecting all (%d) variables since none of them are highly correlated...' % len(numvars)) |
| 1900 | return numvars |
| 1901 | else: |
| 1902 | max_feats = len(corr_list) |
| 1903 | if modeltype == 'Regression': |
| 1904 | sel_function = mutual_info_regression |
| 1905 | fs = SelectKBest(score_func=sel_function, k=max_feats) |
| 1906 | fs.fit(df[corr_list], df[target]) |
| 1907 | mutual_info = dict(zip(corr_list, fs.scores_)) |
| 1908 | else: |
| 1909 | sel_function = mutual_info_classif |
| 1910 | fs = SelectKBest(score_func=sel_function, k=max_feats) |
| 1911 | fs.fit(df[corr_list], df[target]) |
| 1912 | mutual_info = dict(zip(corr_list, fs.scores_)) |
| 1913 | #### The first variable in list has the highest mutual information score with the target variable ### |
| 1914 | sorted_by_mutual_info = [key for (key, val) in sorted(mutual_info.items(), key=lambda kv: kv[1], reverse=True)] |
| 1915 | ##### Now we select the final list of correlated variables ########### |
| 1916 | selected_corr_list = [] |
no test coverage detected