from django.db import models classScrapyModel(models.Model): title = models.CharField(max_length=200) link = models.CharField(max_length=200) desc = models.TextField()
while i < ALPHABET_LEN: delta1[i] = patternlen i += 1
i = 0 while i < patternlen-1: delta1[ord(p[i])] = patternlen-1 - i i += 1 return delta1
deffind(s, p): # find first occurrence of p in s n = len(s) m = len(p) delta1 = make_delta1(p) skip = delta1[ord(p[m-1])] i = 0 while i <= n-m: if s[i+m-1] == p[m-1]: # (boyer-moore) # potential match if s[i:i+m-1] == p[:m-1]: return i if s[i+m] notin p: i = i + m + 1# (sunday) else: i = i + skip # (horspool) else: # skip if s[i+m] notin p: i = i + m + 1# (sunday) else: i = i + 1 return -1# not found
if __name__ == '__main__': print find("HERE IS A SILMPLE EXAMPLE", "EXAMPLE")
#include<stdio.h> #include<string.h> #include<limits.h> /* Returns a pointer to the first occurrence of "needle" * within "haystack", or NULL if not found. Works like * memmem(). */ /* Note: In this example needle is a C string. The ending * 0x00 will be cut off, so you could call this example with * boyermoore_horspool_memmem(haystack, hlen, "abc", sizeof("abc")) */ constunsignedchar * boyermoore_horspool_memmem(constunsignedchar* haystack, size_t hlen, constunsignedchar* needle, size_t nlen) { int index = 0; size_t scan = 0; size_t bad_char_skip[UCHAR_MAX + 1]; /* Officially called: * bad character shift */ /* Sanity checks on the parameters */ if (nlen <= 0 || !haystack || !needle) returnNULL; /* ---- Preprocess ---- */ /* Initialize the table to default value */ /* When a character is encountered that does not occur * in the needle, we can safely skip ahead for the whole * length of the needle. */ for (scan = 0; scan <= UCHAR_MAX; scan = scan + 1) bad_char_skip[scan] = nlen; /* C arrays have the first byte at [0], therefore: * [nlen - 1] is the last byte of the array. */ size_t last = nlen - 1; /* Then populate it with the analysis of the needle */ for (scan = 0; scan < last; scan = scan + 1) bad_char_skip[needle[scan]] = last - scan; /* ---- Do the matching ---- */ /* Search the haystack, while the needle can still be within it. */ while (hlen >= nlen) { /* scan from the end of the needle */ for (scan = last; haystack[scan] == needle[scan]; scan = scan - 1) if (scan == 0) /* If the first byte matches, we've found it. */ { printf("index: %d\n", index-1); return haystack; } /* otherwise, we need to skip some bytes and start again. Note that here we are getting the skip value based on the last byte of needle, no matter where we didn't match. So if needle is: "abcd" then we are skipping based on 'd' and that value will be 4, and for "abcdd" we again skip on 'd' but the value will be only 1. The alternative of pretending that the mismatched character was the last character is slower in the normal case (E.g. finding "abcd" in "...azcd..." gives 4 by using 'd' but only 4-2==2 using 'z'. */ hlen -= bad_char_skip[haystack[last]]; index += bad_char_skip[haystack[last]]; haystack += bad_char_skip[haystack[last]]; } returnNULL; }
#include<stdint.h> #include<stdlib.h> #include<stdio.h> #include<string.h> #define ALPHABET_LEN 256 #define NOT_FOUND patlen #define max(a, b) ((a < b) ? b : a) // delta1 table: delta1[c] contains the distance between the last // character of pat and the rightmost occurence of c in pat. // If c does not occur in pat, then delta1[c] = patlen. // If c is at string[i] and c != pat[patlen-1], we can // safely shift i over by delta1[c], which is the minimum distance // needed to shift pat forward to get string[i] lined up // with some character in pat. // this algorithm runs in alphabet_len+patlen time. voidmake_delta1(int *delta1, uint8_t *pat, int32_t patlen) { int i; for (i=0; i < ALPHABET_LEN; i++) { delta1[i] = NOT_FOUND; } for (i=0; i < patlen-1; i++) { delta1[pat[i]] = patlen-1 - i; } } // true if the suffix of word starting from word[pos] is a prefix // of word intis_prefix(uint8_t *word, int wordlen, int pos) { int i; int suffixlen = wordlen - pos; // could also use the strncmp() library function here for (i = 0; i < suffixlen; i++) { if (word[i] != word[pos+i]) { return0; } } return1; } // length of the longest suffix of word ending on word[pos]. // suffix_length("dddbcabc", 8, 4) = 2 intsuffix_length(uint8_t *word, int wordlen, int pos) { int i; // increment suffix length i to the first mismatch or beginning // of the word for (i = 0; (word[pos-i] == word[wordlen-1-i]) && (i < pos); i++); return i; } // delta2 table: given a mismatch at pat[pos], we want to align // with the next possible full match could be based on what we // know about pat[pos+1] to pat[patlen-1]. // // In case 1: // pat[pos+1] to pat[patlen-1] does not occur elsewhere in pat, // the next plausible match starts at or after the mismatch. // If, within the substring pat[pos+1 .. patlen-1], lies a prefix // of pat, the next plausible match is here (if there are multiple // prefixes in the substring, pick the longest). Otherwise, the // next plausible match starts past the character aligned with // pat[patlen-1]. // // In case 2: // pat[pos+1] to pat[patlen-1] does occur elsewhere in pat. The // mismatch tells us that we are not looking at the end of a match. // We may, however, be looking at the middle of a match. // // The first loop, which takes care of case 1, is analogous to // the KMP table, adapted for a 'backwards' scan order with the // additional restriction that the substrings it considers as // potential prefixes are all suffixes. In the worst case scenario // pat consists of the same letter repeated, so every suffix is // a prefix. This loop alone is not sufficient, however: // Suppose that pat is "ABYXCDEYX", and text is ".....ABYXCDEYX". // We will match X, Y, and find B != E. There is no prefix of pat // in the suffix "YX", so the first loop tells us to skip forward // by 9 characters. // Although superficially similar to the KMP table, the KMP table // relies on information about the beginning of the partial match // that the BM algorithm does not have. // // The second loop addresses case 2. Since suffix_length may not be // unique, we want to take the minimum value, which will tell us // how far away the closest potential match is. voidmake_delta2(int *delta2, uint8_t *pat, int32_t patlen) { int p; int last_prefix_index = patlen-1; // first loop for (p=patlen-1; p>=0; p--) { if (is_prefix(pat, patlen, p+1)) { last_prefix_index = p+1; } delta2[p] = last_prefix_index + (patlen-1 - p); } // second loop for (p=0; p < patlen-1; p++) { int slen = suffix_length(pat, patlen, p); if (pat[p - slen] != pat[patlen-1 - slen]) { delta2[patlen-1 - slen] = patlen-1 - p + slen; } } } uint8_t* boyer_moore(uint8_t *string, uint32_t stringlen, uint8_t *pat, uint32_t patlen) { int i; int delta1[ALPHABET_LEN]; int *delta2 = malloc(patlen * sizeof(int)); make_delta1(delta1, pat, patlen); make_delta2(delta2, pat, patlen); i = patlen-1; while (i < stringlen) { int j = patlen-1; while (j >= 0 && (string[i] == pat[j])) { --i; --j; } if (j < 0) { free(delta2); printf("index: %d\n", i); return (string + i+1); } i += max(delta1[string[i]], delta2[j]); } free(delta2); returnNULL; }