import re
import difflib


#====================================================================
class MatchError(IndexError):
    """Raised when no candidate is a good enough match for a search text.

    Attributes:
        items:  the candidates that were searched.
        tofind: the text that was being looked for.
    """

    def __init__(self, msg='', items=None, tofind=''):
        super(MatchError, self).__init__(msg)
        # A fresh list per instance - a mutable default argument would be
        # shared between every MatchError created without 'items'.
        self.items = [] if items is None else items
        self.tofind = tofind

    def __str__(self):
        return "Could not find '%s' in '%s'"% (self.tofind, self.items)


#====================================================================
def clean_text(text):
    """Return 'text' normalised for matching, in lowercase.

    Everything from the first tab to the end of that line is dropped,
    then all whitespace and non-word characters are removed.
    """
    # remove anything after the first tab
    text_before_tab = re.sub(r"\t.*", "", text)

    # remove any whitespace or non alphanumeric characters
    return re.sub(r"[^\w ]|\s+", "", text_before_tab).lower()


#====================================================================
def _add_unique(mapping, key, value):
    """Store 'value' in 'mapping' under 'key', disambiguating duplicates.

    The first occurrence of a key is stored unchanged.  Later occurrences
    are stored under key + "2", key + "3", ... (the first free suffix).
    The first time a key is found to be duplicated, key + "0" and
    key + "1" are also added, both referring to the first occurrence.
    """
    # no duplicate so just store it without modification
    if key not in mapping:
        mapping[key] = value
        return

    # find the first unused numbered variant of the key
    unique_key = key
    counter = 2
    while unique_key in mapping:
        unique_key = key + str(counter)
        counter += 1
    mapping[unique_key] = value

    # first time this key was seen to be duplicated - add the '0' and
    # '1' aliases for the original entry
    if key + "0" not in mapping:
        mapping[key + "0"] = mapping[key]
        mapping[key + "1"] = mapping[key]


#====================================================================
def build_unique_index_map(items):
    """Map the cleaned text of each item to its index in 'items'.

    Texts are cleaned with clean_text().  Items whose cleaned text
    collides with an earlier one get numbered keys (see _add_unique).

    Returns a dict of {unique cleaned text: index}.
    """
    mapped_items = {}

    for i, text in enumerate(items):
        _add_unique(mapped_items, clean_text(text), i)

    return mapped_items


#====================================================================
def find_best_match(search_text, item_texts, items):
    """Return the element of 'items' whose text best matches 'search_text'.

    'item_texts' and 'items' are parallel sequences: item_texts[i] is the
    text used to match items[i].

    Raises MatchError if difflib.get_close_matches() (called with its
    default settings) returns no candidates.
    """
    search_text = clean_text(search_text)

    # Clean each item, make it unique and map it to the item index
    item_index_map = build_unique_index_map(item_texts)

    # find the list of best matches
    matches = difflib.get_close_matches(search_text, item_index_map.keys())

    if not matches:
        raise MatchError(items = item_texts, tofind = search_text)

    # best match is the first one - so get the index stored
    # for that match text
    best_index = item_index_map[matches[0]]

    return items[best_index]


#====================================================================
def get_control_names(control):
    """Return the names that 'control' can be matched by.

    If the control has a truthy 'ref' attribute, the names are taken
    from that reference control instead.

    Returns a list holding the friendly class name and, when the control
    text is not empty after cleaning, also the text and the text with
    the friendly class name appended (so either 1 or 3 strings).
    """
    names = []

    # if it has a reference control - then use that
    if hasattr(control, 'ref') and control.ref:
        control = control.ref

    # Add the control based on its friendly class name
    names.append(control.FriendlyClassName)

    # if it has some character text then add it based on that
    # and based on that with friendly class name appended
    if clean_text(control.Text):
        names.append(control.Text)
        names.append(control.Text + control.FriendlyClassName)

    # return the names (either 1 or 3 strings)
    return names


#====================================================================
def junk_func(char):
    """Return True if 'char' is found in the string of junk characters.

    Note this is a substring test, so an empty string also gives True.
    """
    return char in ':"/ \t\n\r][{}=-\\|!@#$%^&*,.<>?/()'


#====================================================================
def clean_text2(text):
    """Return 'text' normalised for matching, keeping its original case.

    Everything from the first tab to the end of that line is dropped,
    then all non-word characters are removed.
    """
    # remove anything after the first tab
    text_before_tab = re.sub(r"\t.*", "", text)

    # remove any whitespace or non alphanumeric characters
    return re.sub(r"\W", "", text_before_tab)


#====================================================================
def find_best_control_match(search_text, controls):
    """Return the control whose names best match 'search_text'.

    Every control contributes the names from get_control_names(),
    cleaned with clean_text2().  Names shared by several controls are
    made unique with numeric suffixes (see _add_unique).  Each name is
    scored against the cleaned search text with
    difflib.SequenceMatcher.ratio() and the control owning the highest
    scoring name is returned.

    Raises MatchError if the best score is below 0.5 (which includes
    the case where there are no controls at all).
    """
    name_control_map = {}

    # collect all the possible names for all controls
    # and build a map of them
    for c in controls:
        # clean the names and remove duplicates, keeping their order so
        # that the result does not depend on set/hash ordering
        ctrl_names = []
        for n in get_control_names(c):
            n = clean_text2(n)
            if n not in ctrl_names:
                ctrl_names.append(n)

        for n in ctrl_names:
            _add_unique(name_control_map, n, c)

    # now time to figure out the matching
    ratio_calc = difflib.SequenceMatcher()
    ratio_calc.set_seq1(clean_text2(search_text))

    best_ratio = 0
    best_control = None

    for name, control in name_control_map.items():
        ratio_calc.set_seq2(name)

        # Compare and remember the same measure.  quick_ratio() is only
        # an upper bound on ratio(), so storing it would let a poor
        # match block better ones and slip past the threshold below.
        ratio = ratio_calc.ratio()
        if ratio > best_ratio:
            best_ratio = ratio
            best_control = control

    if best_ratio < .5:
        raise MatchError(items = list(name_control_map), tofind = search_text)

    return best_control