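This scans the upper triangle of a correlation dataframe for variable pairs whose absolute correlation is at least corr_limit, and returns four things: the per-variable counts of how many such pairs each variable appears in, the columns that appear in no such pair, a list of correlated variables, and a dictionary built from the correlated pairs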
(correlation_dataframe, corr_limit=0.70)
| 1945 | |
| 1946 | |
def find_corr_vars(correlation_dataframe, corr_limit=0.70):
    """
    Find the variable pairs whose absolute correlation is at least ``corr_limit``.

    Only the strict upper triangle of the matrix (``k=1``) is scanned, so the
    diagonal is ignored and each unordered pair is looked at once.

    Args:
        correlation_dataframe: DataFrame of pairwise correlations. Row
            positions are translated to names through the column labels, so
            this assumes a square matrix whose rows are in the same order as
            its columns (e.g. the output of ``DataFrame.corr()``) -- TODO
            confirm against callers.
        corr_limit: Inclusive threshold applied to the absolute correlation.

    Returns:
        A 4-tuple ``(corr_pair_count_dict, rem_col_list, corr_list,
        correlated_pair_dict)``:

        * ``corr_pair_count_dict`` -- result of ``count_freq_in_list`` applied
          to the flattened list of pair members. Despite the name, the
          original author's note says an order-preserving structure was chosen
          over a plain dict; verify against the helper.
        * ``rem_col_list`` -- result of ``left_subtract`` on all columns and
          the de-duplicated pair members (presumably the columns that are in
          no correlated pair; verify against the helper).
        * ``corr_list`` -- unique names taken from the keys and values of
          ``correlated_pair_dict`` (unordered, since it goes through a set).
        * ``correlated_pair_dict`` -- ``dict`` built from the
          ``(first, second)`` pairs.
    """
    corr_values = correlation_dataframe.values
    col_names = correlation_dataframe.columns.tolist()

    # Compute the upper-triangle coordinates once and reuse them both for
    # reading the values and for mapping hits back to (row, col) positions,
    # so the two can never disagree.
    row_idx, col_idx = np.triu_indices_from(corr_values, k=1)
    hits = np.flatnonzero(np.abs(corr_values[row_idx, col_idx]) >= corr_limit)

    correlated_pair = [
        (col_names[r], col_names[c])
        for r, c in zip(row_idx[hits], col_idx[hits])
    ]

    # NOTE(review): dict() keeps only the LAST partner for a variable that is
    # the first member of several pairs, so this mapping (and corr_list, which
    # is derived from it) can omit variables. Kept as-is because callers may
    # rely on the current output -- confirm whether this is intended.
    correlated_pair_dict = dict(correlated_pair)

    flat_corr_pair_list = [name for pair in correlated_pair for name in pair]

    # Counting is delegated to count_freq_in_list rather than
    # collections.Counter; per the original note this was chosen to keep the
    # order of the variables intact.
    corr_pair_count_dict = count_freq_in_list(flat_corr_pair_list)

    dict_members = [
        name for item in correlated_pair_dict.items() for name in item
    ]
    corr_list = list(set(dict_members))

    # OrderedDict.fromkeys de-duplicates while preserving first-seen order.
    rem_col_list = left_subtract(
        list(correlation_dataframe),
        list(OrderedDict.fromkeys(flat_corr_pair_list)),
    )
    return corr_pair_count_dict, rem_col_list, corr_list, correlated_pair_dict
| 1967 | |
| 1968 | |
| 1969 | ################################################################################# |
nothing calls this directly
no test coverage detected