| 167 | |
| 168 | |
| 169 | def CIRCULAR(inp): |
def proc_str(s):
    """Return *s* with every character that is neither an ASCII letter nor a
    space replaced by a single space.

    The length of the string is preserved; letters and spaces are kept
    verbatim, so a later ``.split()`` yields purely alphabetic tokens.
    """
    keep = string.ascii_letters
    # One pass over the text: keep letters/spaces, blank out everything else.
    return ''.join(c if (c in keep or c == ' ') else ' ' for c in s)
| 176 | |
def abnormal_entry(line):
    """Tell whether a record's option texts refer to option labels.

    ``line`` is indexed by column name; only the uppercase-letter columns
    (``'A'`` .. ``'Z'``) that are present and not NaN are treated as options.

    Returns ``True`` when either
      * some option text, once non-letters are blanked out by ``proc_str``,
        contains more than one distinct token that is itself a present
        option label (e.g. a text mentioning both ``A`` and ``B``), or
      * some option text is a substring of ``string.ascii_uppercase``.
        Note that this is a substring test, so besides a single capital
        letter it also matches the empty string and runs such as ``'AB'``.
    Otherwise returns ``False``.
    """
    # Gather the usable options in alphabetical order of their label.
    options = {}
    for letter in string.ascii_uppercase:
        if letter in line and not pd.isna(line[letter]):
            options[letter] = line[letter]

    label_like = False
    for text in options.values():
        # Distinct tokens of this option text that coincide with option labels.
        mentioned = {tok for tok in proc_str(text).split() if tok in options}
        if len(mentioned) > 1:
            # Referencing two or more labels is decisive; stop immediately.
            return True
        if text in string.ascii_uppercase:
            # Remember the hit but keep scanning: a later option may still
            # trigger the multi-label rule above (same result either way).
            label_like = True
    return label_like
| 189 | |
| 190 | assert inp.endswith('.tsv') |
| 191 | data = load(inp) |
| 192 | OFFSET = 1e6 |
| 193 | while max(data['index']) >= OFFSET: |
| 194 | OFFSET *= 10 |
| 195 | n_opt = 2 |
| 196 | for i, ch in enumerate(string.ascii_uppercase): |
| 197 | if ch in data: |
| 198 | n_opt = ord(ch) - ord('A') + 1 |
| 199 | else: |
| 200 | for j in range(i + 1, 26): |
| 201 | assert string.ascii_uppercase[j] not in data |
| 202 | groups = defaultdict(list) |
| 203 | for i in range(len(data)): |
| 204 | item = data.iloc[i] |
| 205 | this_n_opt = 0 |
| 206 | for j, ch in enumerate(string.ascii_uppercase[:n_opt]): |
| 207 | if not pd.isna(item[ch]): |
| 208 | this_n_opt = j + 1 |
| 209 | else: |
| 210 | for k in range(j + 1, n_opt): |
| 211 | assert pd.isna(item[string.ascii_uppercase[k]]), (k, item) |
| 212 | assert this_n_opt >= 2 or this_n_opt == 0 |
| 213 | flag = abnormal_entry(item) |
| 214 | if flag or this_n_opt == 0: |
| 215 | groups['abnormal'].append(item) |
| 216 | elif len(item['answer']) > 1 or item['answer'] not in string.ascii_uppercase[:this_n_opt]: |
| 217 | groups['abnormal'].append(item) |
| 218 | else: |
| 219 | groups[this_n_opt].append(item) |
| 220 | for k in groups: |
| 221 | groups[k] = pd.concat(groups[k], axis=1).T |
| 222 | print(f'{k if k == "abnormal" else str(k) + "-choice"} records: {len(groups[k])}') |
| 223 | |
| 224 | data_all = [] |
| 225 | |
| 226 | for k in groups: |