Read random line from .txt file

2019-09-05 04:59发布

问题:

I'm trying to upgrade my Hangman game by reading random words from a .txt file. Thing is, I can't figure out how to read a random line from the .txt file. There are single words on every new line of the .txt file.

void ler_palavras()
{
    FILE *words;

    if ((words = fopen("words.txt", "r")) == NULL) {
        printf("Error! opening file");
        exit(1);
    }

    // reads text until newline
    fscanf(words,"%[^\n]", word);
    fclose(words);
}

回答1:

If, for some reason, you can't just load the whole set of lines into memory (too big or whatever), there is a way to select a random entry from a streaming set of entries. It won't scale indefinitely, and it will exhibit small biases, but this is a game, not cryptography, so that shouldn't be a dealbreaker.

The logic is:

  1. Declare a buffer to hold the word
  2. Open the file
  3. For each line:
    • Increment a counter indicating which line you're on
    • Generate a random double (e.g. with drand48 or whatever PRNG facilities are available to you)
    • If 1.0 / lineno > randval, replace the currently stored word with the word from the current line (so the first line is auto stored, the second line is 50% likely to replace it, the third is 33% likely to do so, etc.)
  4. When you run out of lines, whatever is stored in word is your selection

Assuming the number of lines is small enough (and the range of doubles produced by your PRNG is fine-grained enough), this gives as close as possible to an equal likelihood of any given line being selected; for two lines, each has a 50/50 shot, for three, 33.33...%, etc.

I lack a C compiler right now, but the basic code would look like:

/*
 * Returns a random line (w/o newline) from the file provided.
 *
 * The file is read once and the line is chosen by reservoir sampling: line k
 * replaces the current choice with probability 1/k, so every line ends up
 * equally likely without having to know the line count in advance.
 *
 * Randomness comes from rand(); seed it once with srand() before calling.
 *
 * Returns a malloc()-allocated string that the caller must free():
 *   - the chosen line, truncated to 255 characters if it is longer;
 *   - an empty string if the file has no lines;
 *   - NULL if the file cannot be opened or memory cannot be allocated.
 */
char* choose_random_word(const char *filename) {
    FILE *f;
    size_t lineno = 0;
    size_t selectlen;
    int at_line_start = 1; /* Is the next fgets() chunk the start of a line? */
    char selected[256]; /* Arbitrary, make it whatever size makes sense */
    char current[256];
    char *result;

    selected[0] = '\0'; /* Don't crash if file is empty */

    f = fopen(filename, "r");
    if (f == NULL) {
        return NULL;
    }

    while (fgets(current, sizeof(current), f)) {
        size_t len = strlen(current);

        /*
         * A line longer than the buffer arrives in several chunks.  Only the
         * first chunk counts as a line; the rest are skipped so that long
         * lines are neither over-weighted nor returned as mid-line fragments.
         */
        if (at_line_start) {
            ++lineno;
            if (rand() / (RAND_MAX + 1.0) < 1.0 / (double)lineno) {
                memcpy(selected, current, len + 1);
            }
        }
        at_line_start = (len > 0 && current[len - 1] == '\n');
    }
    fclose(f);

    selectlen = strlen(selected);
    if (selectlen > 0 && selected[selectlen - 1] == '\n') {
        selected[--selectlen] = '\0';
    }

    /* Portable replacement for strdup(), which is not in ISO C before C23. */
    result = malloc(selectlen + 1);
    if (result != NULL) {
        memcpy(result, selected, selectlen + 1);
    }
    return result;
}


回答2:

rand() has limitations: it only generates values from 0 to RAND_MAX, while a file can have many times RAND_MAX lines. Assuming the line count is on the order of RAND_MAX/10 or less, the following will meet OP's goals.

Perform one pass to count the number of lines. --> lc

For each random line needed, pick a random index in the [0 ... lc-1] range and re-read the file from the beginning, skipping lines until that index is reached.

Then simply read and print that line. No need for a line buffer. The file is the line buffer. The code re-uses Line_Count() for both the total line count calculation and for reading until the nth line.

#include <assert.h>
#include <ctype.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>

// Rewind istream, then consume characters until either line_index newline
// characters have been read or end-of-file is reached.
// Returns the number of lines that were started while reading; on return the
// stream is positioned just past the last character consumed.
int Line_Count(FILE *istream, int line_index) {
  int lines_started = 0;
  int at_line_start = 1;  // true before the first char and after each '\n'

  rewind(istream);
  for (int remaining = line_index; remaining > 0; ) {
    int c = fgetc(istream);
    if (c == EOF) {
      break;
    }
    if (at_line_start) {
      lines_started++;
    }
    at_line_start = (c == '\n');
    if (at_line_start) {
      remaining--;
    }
  }
  return lines_started;
}

// Print line number line_index + 1 of istream as "<number>: <text>".
// Line_Count() is used to position the stream at the start of that line;
// only printable characters of the line are echoed.
void print_random_line(FILE *istream, int line_index) {
  printf("%8d: <", line_index + 1);
  Line_Count(istream, line_index);

  for (;;) {
    int c = fgetc(istream);
    if (c == EOF || c == '\n') {
      break;
    }
    if (isprint(c)) {
      putchar(c);
    }
  }
  printf(">\n");
}

// Print five randomly chosen lines from "test.txt".
// Returns EXIT_FAILURE if the file cannot be opened, is empty, or has too
// many lines for rand() to index.  These are runtime conditions, so they are
// checked explicitly rather than with assert(), which is compiled out when
// NDEBUG is defined.
int main(void) {
  srand((unsigned) time(NULL));

  FILE *istream = fopen("test.txt", "r");
  if (istream == NULL) {
    perror("test.txt");
    return EXIT_FAILURE;
  }

  int lc = Line_Count(istream, RAND_MAX);
  if (lc <= 0 || lc >= RAND_MAX) {
    // lc == 0 would make rand() % lc a division by zero.
    fprintf(stderr, "test.txt: line count %d is outside the supported range\n", lc);
    fclose(istream);
    return EXIT_FAILURE;
  }

  for (int i = 0; i < 5; i++) {
    print_random_line(istream, rand() % lc);
  }
  fclose(istream);
  return EXIT_SUCCESS;
}


回答3:

Here is another solution, still limited by RAND_MAX, that doesn't require reading each line up to the chosen line. The idea is to use a binary file that stores each word in the same number of bytes, so any word can be accessed by using fseek() and fread(). The first entry in the file is a long value that stores the number of words in the file. When words are added, this value is updated.

Here is an implementation that looks for an ordinary text file called wordlist.txt, which has one word on each line. If found, the program updates (or creates, if necessary) a file called wordlist.fmt. The update function reads in each word from the text file, skipping blank lines, and stores it in the binary file in a fixed number of bytes. After reading in all of the words, the word count is updated. After running the program once with a text file, you should remove the text file, or the next run will add the words again. The .fmt file should stay, and if you want to add more words, just put a new text file in the directory with the executable and run it again.

The loop that prints five random words generates a random number, uses that number to move to a file position containing a word, reads that word into an array, and prints it.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Text file of new words to import; opened for reading in main(). */
#define RAW_WORDS  "wordlist.txt"
/* Binary word file: a long word count followed by fixed-size word records. */
#define FMT_WORDS  "wordlist.fmt"
/* Size in bytes of the word count stored at the start of FMT_WORDS. */
#define OFFSET_SZ  (sizeof(long))
/* Size in bytes of one word record, and of the buffers that hold a word. */
#define MAXWORD    30

/* Appends the non-blank lines of fp_raw to fp_fmt and rewrites the word count. */
void update_words(FILE *fp_fmt, FILE *fp_raw);
/* Terminates str at its first newline, if it has one. */
void strip(char *str);

/*
 * Imports any words found in RAW_WORDS into the fixed-record file FMT_WORDS
 * (creating FMT_WORDS if needed), then prints five randomly chosen words
 * from FMT_WORDS.
 *
 * Exits with EXIT_FAILURE if FMT_WORDS cannot be opened, read or written, or
 * if it contains no words (there would be nothing to choose from).
 */
int main(void)
{
    FILE *raw_words, *formatted_words;
    char word[MAXWORD];
    long wordcount;
    long wpos;
    int i;

    raw_words = fopen(RAW_WORDS, "r");

    /* Try to open existing file for update, otherwise open new file */
    if ((formatted_words = fopen(FMT_WORDS, "r+b")) == NULL){
        if ((formatted_words = fopen(FMT_WORDS, "w+b")) == NULL) {
            fprintf(stderr, "Unable to open file %s\n", FMT_WORDS);
            exit(EXIT_FAILURE);
        }
        /* initialize file wordcount */
        wordcount = 0L;
        if (fwrite(&wordcount, OFFSET_SZ, 1, formatted_words) != 1) {
            fprintf(stderr, "Error writing to %s\n", FMT_WORDS);
            exit(EXIT_FAILURE);
        }
        fflush(formatted_words);
    }

    /* Update FMT_WORDS file if RAW_WORDS is present */
    if (raw_words != NULL)
        update_words(formatted_words, raw_words);

    /* Get 5 random words and print them */
    srand((unsigned)time(NULL));

    rewind(formatted_words);
    if (fread(&wordcount, OFFSET_SZ, 1, formatted_words) != 1) {
        fprintf(stderr, "Unable to read word count from %s\n", FMT_WORDS);
        exit(EXIT_FAILURE);
    }
    if (wordcount <= 0) {
        /* rand() % wordcount below would divide by zero. */
        fprintf(stderr, "No words available in %s\n", FMT_WORDS);
        exit(EXIT_FAILURE);
    }

    printf("Five random words from %s:\n", FMT_WORDS);
    for (i = 0; i < 5; i++) {
        wpos = rand() % wordcount;
        if (fseek(formatted_words, (long)OFFSET_SZ + wpos * MAXWORD, SEEK_SET) != 0
            || fread(word, MAXWORD, 1, formatted_words) != 1) {
            fprintf(stderr, "Unable to read word %ld from %s\n", wpos, FMT_WORDS);
            exit(EXIT_FAILURE);
        }
        /* Never trust the file to contain a terminated record. */
        word[MAXWORD - 1] = '\0';
        puts(word);
    }

    if (raw_words && (fclose(raw_words) != 0))
        fprintf(stderr, "Unable to close file %s\n", RAW_WORDS);
    if (fclose(formatted_words) != 0)
        fprintf(stderr, "Unable to close file %s\n", FMT_WORDS);

    return 0;
}

void update_words(FILE *fp_fmt, FILE *fp_raw)
{
    char word[MAXWORD];
    long wordcount;

    /* Read in wordcount and move to end of file */
    rewind(fp_fmt);
    fread(&wordcount, OFFSET_SZ, 1, fp_fmt);
    fseek(fp_fmt, wordcount * MAXWORD, SEEK_CUR);

    /* Write formatted words, skipping blank lines */
    while (fgets(word, MAXWORD, fp_raw) != NULL) {
        if (word[0] != '\n') {
            strip(word);
            if (fwrite(word, MAXWORD, 1, fp_fmt) != 1) {
                fprintf(stderr, "Error writing to %s\n", FMT_WORDS);
                exit(EXIT_FAILURE);
            }
            ++wordcount;
        }
    }

    /* Update wordcount in file and flush output */
    rewind(fp_fmt);
    fwrite(&wordcount, OFFSET_SZ, 1, fp_fmt);
    fflush(fp_fmt);
}

/*
 * Cuts str off at its first newline: scans forward to the first '\n' or to
 * the terminating '\0', whichever comes first, and stores '\0' there.
 */
void strip(char *str)
{
    char *end = str;

    for (;;) {
        if (*end == '\0' || *end == '\n')
            break;
        ++end;
    }
    *end = '\0';
}


标签: c file random