如果单词存在于单词数组中,则将其排除

jay*_*dez 6 c

考虑到此代码可以计算所有出现的次数,您如何删除常用单词?

例如,如果该单词来自前100个英语单词,则不要计算该单词。

如果您根据 Wikipedia 选出最常见的 100 个单词,如何将它们添加到数组中,并在计数前进行检查:若单词出现在该列表中,就不对其计数?参考:https://en.wikipedia.org/wiki/Most_common_words_in_English

数组形式的前100个最常用词:

#define NUMBER_OF_STRING 100
#define MAX_STRING_SIZE   50

/*
 * The 100 most common English words, used as an exclusion list.
 *
 * Every entry is lowercase -- including "i" -- because candidate words are
 * lowercased before they are compared against this table with strcmp();
 * an uppercase "I" entry could never match.
 */
char commonWords[NUMBER_OF_STRING][MAX_STRING_SIZE] = {
    "the", "be", "to", "of", "and", "a", "in", "that", "have", "i",
    "it", "for", "not", "on", "with", "he", "as", "you", "do", "at",
    "this", "but", "his", "by", "from", "they", "we", "say", "her", "she",
    "or", "an", "will", "my", "one", "all", "would", "there", "their", "what",
    "so", "up", "out", "if", "about", "who", "get", "which", "go", "me",
    "when", "make", "can", "like", "time", "no", "just", "him", "know", "take",
    "people", "into", "year", "your", "good", "some", "could", "them", "see", "other",
    "than", "then", "now", "look", "only", "come", "its", "over", "think", "also",
    "back", "after", "use", "two", "how", "our", "work", "first", "well", "way",
    "even", "new", "want", "because", "any", "these", "give", "day", "most", "us"
};
Run Code Online (Sandbox Code Playgroud)

代码示例:

/**
 * C program to count occurrences of all words in a file.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>

#define MAX_WORD  20000     /* size in bytes of each word buffer, nul-character included */
#define MAX_WORDS     8     /* number of words_t structs allocated initially */

#ifndef PATH_MAX
#define PATH_MAX   2048     /* max path (defined for Linux in limits.h) */
#endif

/* One entry per distinct word found in the input file. */
typedef struct {
    char word[MAX_WORD];    /* the word itself, stored in lowercase */
    int cap;                /* non-zero once the word has appeared capitalized */
    int count;              /* number of occurrences counted so far */
} words_t;

/**
 * Convert a nul-terminated string to lowercase in place.
 *
 * @param str  writable, nul-terminated string to convert
 * @return     str, so the call can be used inside an expression
 *
 * Each character is handed to tolower() as an unsigned char: passing a
 * negative plain-char value (other than EOF) to a <ctype.h> function is
 * undefined behavior, which bytes above 0x7F trigger where char is signed.
 */
char *strlwr (char *str)
{
    for (char *p = str; *p; p++)
        *p = (char) tolower ((unsigned char) *p);

    return str;
}

/**
 * Count the occurrences of every distinct word in a text file and print
 * the count of each word that appeared capitalized at least once.
 *
 * Prompts on stdout for a file path, reads whitespace-delimited tokens from
 * that file, strips trailing punctuation, lowercases each token and tallies
 * it in a dynamically grown array of words_t.
 *
 * @return 0 on success, 1 if no file path could be read.  Exits with
 *         EXIT_FAILURE if the initial allocation or the file open fails.
 */
int main (void) {

    FILE *fptr;
    char path[PATH_MAX], word[MAX_WORD];
    char pathfmt[32], wordfmt[32];  /* width-limited "%<N>s" conversions */
    size_t i, len, index = 0, max_words = MAX_WORDS;

    /* pointer to allocated block of max_words struct initialized zero */
    words_t *words = calloc (max_words, sizeof *words);
    if (!words) {   /* validate every allocation */
        perror ("calloc-words");
        exit (EXIT_FAILURE);
    }

    /* A bare "%s" lets an over-long token write past the end of its buffer,
     * so build conversions whose field width is the buffer size minus one
     * (the remaining byte holds the nul-character). */
    snprintf (pathfmt, sizeof pathfmt, "%%%zus", sizeof path - 1);
    snprintf (wordfmt, sizeof wordfmt, "%%%zus", sizeof word - 1);

    /* Input file path */
    printf ("Enter file path: ");
    fflush (stdout);                    /* make sure the prompt is visible */
    if (scanf (pathfmt, path) != 1) {   /* validate every input */
        fputs ("error: invalid file path or cancellation.\n", stderr);
        free (words);
        return 1;
    }

    fptr = fopen (path, "r");   /* open file */
    if (fptr == NULL) {         /* validate file open */
        fputs ( "Unable to open file.\n"
                "Please check you have read privileges.\n", stderr);
        free (words);
        exit (EXIT_FAILURE);
    }

    while (fscanf (fptr, wordfmt, word) == 1) {  /* while valid word read */
        int iscap = 0, isunique = 1;    /* is capital, is unique flags */

        /* <ctype.h> functions require an unsigned char value (or EOF) */
        if (isupper ((unsigned char) *word))    /* first letter uppercase? */
            iscap = 1;

        /* remove all trailing punctuation characters */
        len = strlen (word);                    /* get length */
        while (len && ispunct ((unsigned char) word[len - 1]))
            word[--len] = 0;

        if (!len)       /* token was punctuation only - nothing to count */
            continue;

        strlwr (word);                  /* convert word to lowercase */

        /* check if word exists in list of all distinct words */
        for (i = 0; i < index; i++) {
            if (strcmp (words[i].word, word) == 0) {
                isunique = 0;               /* set unique flag zero */
                if (iscap)                  /* if capital flag set */
                    words[i].cap = iscap;   /* set capital flag in struct */
                words[i].count++;           /* increment word count */
                break;                      /* bail - done */
            }
        }
        if (isunique) { /* if unique, add to array, increment index */
            if (index == max_words) {       /* is realloc needed? */
                /* always use a temporary pointer with realloc */
                void *tmp = realloc (words, 2 * max_words * sizeof *words);
                if (!tmp) { /* validate every allocation */
                    perror ("realloc-words");
                    break;  /* don't exit, original data still valid */
                }
                words = tmp;    /* assign reallocated block to words */
                /* zero the new half so cap and count start at 0 */
                memset (words + max_words, 0, max_words * sizeof *words);
                max_words *= 2; /* update max_words to reflect new limit */
            }
            memcpy (words[index].word, word, len + 1);  /* have len */
            if (iscap)                      /* if cap flag set */
                words[index].cap = iscap;   /* set capital flag in struct */
            words[index++].count++;         /* increment count & index */
        }
    }
    fclose (fptr);  /* close file */

    /*
     * Print occurrences of the words that appeared capitalized.
     */
    puts ("\nOccurrences of all distinct words with Cap in file:");
    for (i = 0; i < index; i++) {
        if (words[i].cap) {
            strcpy (word, words[i].word);
            *word = (char) toupper ((unsigned char) *word);
            /*
             * %-8d prints the count left-aligned in an 8 character
             * wide field, followed by the capitalized word.
             */
            printf ("%-8d %s\n", words[i].count, word);
        }
    }
    free (words);

    return 0;
}
Run Code Online (Sandbox Code Playgroud)

要测试的文本文件:(cars.txt)

A car (or automobile) is a wheeled motor vehicle used for transportation. Most definitions of car say they run primarily on roads, seat one to eight people, have four tires, and mainly transport people rather than goods.[2][3]

Cars came into global use during the 20th century, and developed economies depend on them. The year 1886 is regarded as the birth year of the modern car when German inventor Karl Benz patented his Benz Patent-Motorwagen. Cars became widely available in the early 20th century. One of the first cars accessible to the masses was the 1908 Model T, an American car manufactured by the Ford Motor Company. Cars were rapidly adopted in the US, where they replaced animal-drawn carriages and carts, but took much longer to be accepted in Western Europe and other parts of the world.

Cars have controls for driving, parking, passenger comfort, and a variety of lights. Over the decades, additional features and controls have been added to vehicles, making them progressively more complex. These include rear reversing cameras, air conditioning, navigation systems, and in-car entertainment. Most cars in use in the 2010s are propelled by an internal combustion engine, fueled by the combustion of fossil fuels. Electric cars, which were invented early in the history of the car, began to become commercially available in 2008.

There are costs and benefits to car use. The costs include acquiring the vehicle, interest payments (if the car is financed), repairs and maintenance, fuel, depreciation, driving time, parking fees, taxes, and insurance.[4] The costs to society include maintaining roads, land use, road congestion, air pollution, public health, health care, and disposing of the vehicle at the end of its life. Road traffic accidents are the largest cause of injury-related deaths worldwide.[5]

The benefits include on-demand transportation, mobility, independence, and convenience.[6] The societal benefits include economic benefits, such as job and wealth creation from the automotive industry, transportation provision, societal well-being from leisure and travel opportunities, and revenue generation from the taxes. People's ability to move flexibly from place to place has far-reaching implications for the nature of societies.[7] There are around 1 billion cars in use worldwide. The numbers are increasing rapidly, especially in China, India and other newly industrialized countries.[8]
Run Code Online (Sandbox Code Playgroud)

当前输出:

Occurrences of all distinct words with Cap in file:
3        A
2        Motor
2        Most
2        One
8        Cars
29       The
1        German
1        Karl
2        Benz
1        Patent-motorwagen
1        Model
1        T
1        American
1        Ford
1        Company
1        Us
1        Western
1        Europe
1        Over
1        These
1        Electric
2        There
2        Road
1        People's
1        China
1        India
Run Code Online (Sandbox Code Playgroud)

预期输出:(仅示例)

2        Motor
1        German
1        Karl
2        Benz
1        Patent-motorwagen
1        Model
1        T
1        American
1        Ford
1        Company
Run Code Online (Sandbox Code Playgroud)

编辑更新:可能的解决方案:

  • 然后继续(无效)

    // skip the word if it is a common word
    // NOTE: this loop cannot skip anything. `continue` applies to the
    // innermost enclosing loop, which is this `for`, so a match merely moves
    // on to the next commonWords entry. The loop always runs to completion
    // and execution then carries on with whatever follows it.
    for (int i = 0; i < NUMBER_OF_STRING; i++) {
        if (strcmp(word, commonWords[i])==0) {
            continue;   // continues this for loop, not any outer loop
        }
    }
    
    Run Code Online (Sandbox Code Playgroud)
  • Dav*_*ica 3

    稍微更有效的方法是只调用一次 strstr,而不是把单词与前 100 个最常用单词逐一比较。由于您知道这 100 个最常用的单词,并且它们不会改变,因此可以轻松确定其中最长的单词是 7 个字符。换句话说,只有当 word 的长度小于下面的 TOP_LEN 时,才需要测试它是否是最常用单词之一:

    #define TOP_LEN       8     /* longest string in TOP100 + nul-character */
    
    Run Code Online (Sandbox Code Playgroud)

    由于这些单词不会改变,您可以更进一步,把它们定义为一个常量字符串:

    /*
     * The 100 most common English words as one space-delimited string.
     * A single space precedes every word and one follows the last word, so
     * a whole word can be located by searching for " word ".
     */
    const char TOP100[] =
        " the be to of and a in that have i"
        " it for not on with he as you do at"
        " this but his by from they we say her she"
        " or an will my one all would there their what"
        " so up out if about who get which go me"
        " when make can like time no just him know take"
        " people into year your good some could them see other"
        " than then now look only come its over think also"
        " back after use two how our work first well way"
        " even new want because any these give day most us ";
    
    Run Code Online (Sandbox Code Playgroud)

    (注意:每个单词前后各有一个空格,这样您就可以在 word 两侧各加一个空格来构造 teststr,再用 strstr 进行搜索。'I' 已转换为小写,以便在 strlwr (word); 之后仍能匹配。)

    另请注意:您也可以使用常量文字#define TOP100 " the ... us ",但它会严重地换行并滚动到页面之外 - 由您决定)

    对于 100 个最常见单词的常量字符串,唯一需要添加的是:

            ...
            strlwr (word);                  /* convert word to lowercase */
    
            /* check against 100 most common words (TOP100)
             *
             * The word is wrapped in one space on each side before the
             * search so that only a whole, space-delimited entry of TOP100
             * can match (e.g. "he" must not match inside " the ").
             */
            if (len < TOP_LEN) {                    /* only words shorter than TOP_LEN are tested */
                char teststr[TOP_LEN * 2];          /* holds " word " plus the nul-character */
                sprintf (teststr, " %s ", word);    /* fits, assuming len is the length of word */
                if (strstr (TOP100, teststr))       /* whole-word match in TOP100? */
                    continue;                       /* common word: skip it, get next word */
            }
            ...
    
    Run Code Online (Sandbox Code Playgroud)

    如上所示,您先检查该单词是否为 7 个字符或更少(否则它不可能是最常用单词之一,无需检查)。然后声明一个 teststr 来保存两端各带一个空格的字符串。(由于最长的常用单词是 7 个字符,7 个字符加上 2 个空格是 9 个字符,再加上 nul 字符就是 10 个字符,所以这里 16 个字符就足够了。)

    只需一次简单的 sprintf 调用即可在 word 的两端添加空格,然后只需调用一次 strstr 即可查看 word 是否在前 100 个最常用单词之内。如果是,则无需继续处理,只需 continue 获取下一个单词即可。

    将其完全放入您的代码中,您将得到:

    /**
     * C program to count occurrences of all words in a file.
     */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <ctype.h>
    #include <limits.h>
    
    #define MAX_WORD  20000     /* size in bytes of each word buffer, nul-character included */
    #define MAX_WORDS     8     /* number of words_t structs allocated initially */
    #define TOP_LEN       8     /* longest word in TOP100 (7 chars) + nul-character */
    
    #ifndef PATH_MAX
    #define PATH_MAX   2048     /* max path (defined for Linux in limits.h) */
    #endif
    
    /*
     * The 100 most common English words as one space-delimited string.
     * A single space precedes every word and one follows the last word, so
     * a whole word can be located by searching for " word ".
     */
    const char TOP100[] =
        " the be to of and a in that have i"
        " it for not on with he as you do at"
        " this but his by from they we say her she"
        " or an will my one all would there their what"
        " so up out if about who get which go me"
        " when make can like time no just him know take"
        " people into year your good some could them see other"
        " than then now look only come its over think also"
        " back after use two how our work first well way"
        " even new want because any these give day most us ";
    
    /* One entry per distinct word found in the input file. */
    typedef struct {
        char word[MAX_WORD];    /* the word itself, stored in lowercase */
        int cap;                /* non-zero once the word has appeared capitalized */
        int count;              /* number of occurrences counted so far */
    } words_t;
    
    /**
     * Convert a nul-terminated string to lowercase in place.
     *
     * @param str  writable, nul-terminated string to convert
     * @return     str, so the call can be used inside an expression
     *
     * Each character is handed to tolower() as an unsigned char: passing a
     * negative plain-char value (other than EOF) to a <ctype.h> function is
     * undefined behavior, which bytes above 0x7F trigger where char is signed.
     */
    char *strlwr (char *str)
    {
        for (char *p = str; *p; p++)
            *p = (char) tolower ((unsigned char) *p);

        return str;
    }
    
    /**
     * Count the occurrences of every distinct word in a text file, ignoring
     * the 100 most common English words (TOP100), and print the count of
     * each remaining word that appeared capitalized at least once.
     *
     * Prompts on stdout for a file path, reads whitespace-delimited tokens
     * from that file, strips trailing punctuation, lowercases each token,
     * drops it if it is listed in TOP100 and otherwise tallies it in a
     * dynamically grown array of words_t.
     *
     * @return 0 on success, 1 if no file path could be read.  Exits with
     *         EXIT_FAILURE if the initial allocation or the file open fails.
     */
    int main (void) {

        FILE *fptr;
        char path[PATH_MAX], word[MAX_WORD];
        char pathfmt[32], wordfmt[32];  /* width-limited "%<N>s" conversions */
        size_t i, len, index = 0, max_words = MAX_WORDS;

        /* pointer to allocated block of max_words struct initialized zero */
        words_t *words = calloc (max_words, sizeof *words);
        if (!words) {   /* validate every allocation */
            perror ("calloc-words");
            exit (EXIT_FAILURE);
        }

        /* A bare "%s" lets an over-long token write past the end of its
         * buffer, so build conversions whose field width is the buffer size
         * minus one (the remaining byte holds the nul-character). */
        snprintf (pathfmt, sizeof pathfmt, "%%%zus", sizeof path - 1);
        snprintf (wordfmt, sizeof wordfmt, "%%%zus", sizeof word - 1);

        /* Input file path */
        printf ("Enter file path: ");
        fflush (stdout);                    /* make sure the prompt is visible */
        if (scanf (pathfmt, path) != 1) {   /* validate every input */
            fputs ("error: invalid file path or cancellation.\n", stderr);
            free (words);
            return 1;
        }

        fptr = fopen (path, "r");   /* open file */
        if (fptr == NULL) {         /* validate file open */
            fputs ( "Unable to open file.\n"
                    "Please check you have read privileges.\n", stderr);
            free (words);
            exit (EXIT_FAILURE);
        }

        while (fscanf (fptr, wordfmt, word) == 1) {  /* while valid word read */
            int iscap = 0, isunique = 1;    /* is capital, is unique flags */

            /* <ctype.h> functions require an unsigned char value (or EOF) */
            if (isupper ((unsigned char) *word))    /* first letter uppercase? */
                iscap = 1;

            /* remove all trailing punctuation characters */
            len = strlen (word);                    /* get length */
            while (len && ispunct ((unsigned char) word[len - 1]))
                word[--len] = 0;

            if (!len)       /* token was punctuation only - nothing to count */
                continue;

            strlwr (word);                  /* convert word to lowercase */

            /* check against 100 most common words (TOP100): wrap the word in
             * spaces so only a whole, space-delimited entry can match */
            if (len < TOP_LEN) {                    /* word less than TOP_LEN? */
                char teststr[TOP_LEN * 2];          /* buffer for " word " */
                snprintf (teststr, sizeof teststr, " %s ", word);
                if (strstr (TOP100, teststr))       /* check if in TOP100 */
                    continue;                       /* if so, get next word */
            }

            /* check if word exists in list of all distinct words */
            for (i = 0; i < index; i++) {
                if (strcmp (words[i].word, word) == 0) {
                    isunique = 0;               /* set unique flag zero */
                    if (iscap)                  /* if capital flag set */
                        words[i].cap = iscap;   /* set capital flag in struct */
                    words[i].count++;           /* increment word count */
                    break;                      /* bail - done */
                }
            }
            if (isunique) { /* if unique, add to array, increment index */
                if (index == max_words) {       /* is realloc needed? */
                    /* always use a temporary pointer with realloc */
                    void *tmp = realloc (words, 2 * max_words * sizeof *words);
                    if (!tmp) { /* validate every allocation */
                        perror ("realloc-words");
                        break;  /* don't exit, original data still valid */
                    }
                    words = tmp;    /* assign reallocated block to words */
                    /* zero the new half so cap and count start at 0 */
                    memset (words + max_words, 0, max_words * sizeof *words);
                    max_words *= 2; /* update max_words to reflect new limit */
                }
                memcpy (words[index].word, word, len + 1);  /* have len */
                if (iscap)                      /* if cap flag set */
                    words[index].cap = iscap;   /* set capital flag in struct */
                words[index++].count++;         /* increment count & index */
            }
        }
        fclose (fptr);  /* close file */

        /*
         * Print occurrences of the words that appeared capitalized.
         */
        puts ("\nOccurrences of all distinct words with Cap in file:");
        for (i = 0; i < index; i++) {
            if (words[i].cap) {
                strcpy (word, words[i].word);
                *word = (char) toupper ((unsigned char) *word);
                /*
                 * %-8d prints the count left-aligned in an 8 character
                 * wide field, followed by the capitalized word.
                 */
                printf ("%-8d %s\n", words[i].count, word);
            }
        }
        free (words);

        return 0;
    }
    
    Run Code Online (Sandbox Code Playgroud)

    使用/输出示例

    与上次的情况一样,您的预期输出:(仅示例)是错误的,因为您的代码中没有任何逻辑去除复数、所有格或复数所有格形式,因此对 cars.txt 文件的输出将是:

    $ ./bin/unique_words_exclude_top_100
    Enter file path: dat/cars.txt
    
    Occurrences of all distinct words with Cap in file:
    2        Motor
    8        Cars
    1        German
    1        Karl
    2        Benz
    1        Patent-motorwagen
    1        Model
    1        T
    1        American
    1        Ford
    1        Company
    1        Western
    1        Europe
    1        Electric
    2        Road
    1        People's
    1        China
    1        India
    
    Run Code Online (Sandbox Code Playgroud)

    检查一下,如果您还有其他问题,请告诉我。