Files

662 lines
21 KiB
C

/*
This program is part of the TACLeBench benchmark suite.
Version 2.0
Name: anagram
Author: Raymond Chen
Function: A program that computes anagrams.
Source: See below.
Original name: anagram
Changes: See ChangeLog.txt
License: See below.
*/
/*
Anagram program by Raymond Chen,
inspired by a similar program by Brian Scearce
This program is Copyright 1991 by Raymond Chen.
(rjc@math.princeton.edu)
This program may be freely distributed provided all alterations
to the original are clearly indicated as such.
*/
/* There are two tricks. First is the Basic Idea:
When the user types in a phrase, the phrase is first preprocessed to
determine how many of each letter appears. A bit field is then constructed
dynamically, such that each field is large enough to hold the next power
of two larger than the number of times the character appears. For example,
if the phrase is hello, world, the bit field would be
00 00 00 000 000 00 00
d e h l o r w
The phrase hello, world, itself would be encoded as
01 01 01 011 010 01 01
d e h l o r w
and the word hollow would be encoded as
00 00 01 010 010 00 01
d e h l o r w
The top bit of each field is set in a special value called the sign.
Here, the sign would be
10 10 10 100 100 10 10
d e h l o r w
The reason for packing the values into a bit field is that the operation
of subtracting out the letters of a word from the current phrase can be
carried out in parallel. for example, subtracting the word hello from
the phrase hello, world, is merely
d e h l o r w
01 01 01 011 010 01 01 (dehllloorw)
- 00 00 01 010 010 00 01 (hlloow)
========================
01 01 00 001 000 01 00 (delr)
Since none of the sign bits is set, the word fits, and we can continue.
Suppose the next word we tried was hood.
d e h l o r w
01 01 00 001 000 01 00 (delr)
- 01 00 01 000 010 00 00 (hood)
========================
00 00 11 000 110 01 00
^ ^
A sign bit is set. (Two, actually.) This means that hood does not
fit in delr, so we skip it and try another word. (Observe that
when a sign bit becomes set, it screws up the values for the letters to
the left of that bit, but that's not important.)
The inner loop of an anagram program is testing to see if a
word fits in the collection of untried letters. Traditional methods
keep an array of 26 integers, which are then compared in turn. This
means that there are 26 comparisons per word.
This method reduces the number of comparisons to MAX_QUAD, typically 2.
Instead of looping through an array, we merely perform the indicated
subtraction and test if any of the sign bits is set.
*/
/* The nuts and bolts:
The dictionary is loaded and preprocessed. The preprocessed dictionary
is a concatenation of copies of the structure:
struct dictword {
char bStructureSize; -- size of this structure
char cLetters; -- number of letters in the word
char achWord[]; -- the word itself (0-terminated)
}
Since this is a variable-sized structure, we keep its size in the structure
itself for rapid stepping through the table.
When a phrase is typed in, it is first preprocessed as described in the
Basic Idea. We then go through the dictionary, testing each word. If
the word fits in our phrase, we build the bit field for its frequency
table and add it to the list of candidates.
*/
/*
The Second Trick:
Before diving into our anagram search, we then tabulate how many times
each letter appears in our list of candidates, and sort the table, with
the rarest letter first.
We then do our anagram search.
Like most anagram programs, this program does a depth-first search.
Although most anagram programs do some sort of heuristics to decide what
order to place words in the list_of_candidates, the search itself proceeds
according to a greedy algorithm. That is, once you find a word that fits,
subtract it and recurse.
This anagram program exercises some restraint and does not march down
every branch that shows itself. Instead, it only goes down branches
that use the rarest unused letter. This helps to find dead ends faster.
FindAnagram(unused_letters, list_of_candidates) {
l = the rarest letter as yet unused
For word in list_of_candidates {
if word does not fit in unused_letters, go on to the next word.
if word does not contain l, defer.
FindAnagram(unused_letters - word, list_of_candidates[word,...])
}
}
The heuristic of the Second Trick can probably be improved. I invite
anyone willing to improve it to do so.
*/
/* Before compiling, make sure Quad and MASK_BITS are set properly. For best
results, make Quad the largest integer size supported on your machine.
So if your machine has long longs, make Quad an unsigned long long.
(I called it Quad because on most machines, the largest integer size
supported is a four-byte unsigned long.)
If you need to be able to anagram larger phrases, increase MAX_QUADS.
If you increase it beyond 4, you'll have to add a few more loop unrolling
steps to FindAnagram.
*/
#include "anagram_ctype.h"
#include "anagram_stdlib.h"
#include "anagram_strings.h"
#include "anagram_compare.h"
/*
Defines
*/
// Wasm loop bounds
#include "anagram_input.c"
#include "anagram_stdlib.c"
__attribute__((import_module("__pragma"), import_name("loopbound"))) extern void
__pragma_loopbound(unsigned int min_bound, unsigned int max_bound);
#define anagram_DICTWORDS 2279
#define anagram_MASK_BITS 32 /* number of bits in a Quad */
#define anagram_MAX_QUADS 2 /* controls largest phrase */
#define anagram_MAXCAND 100 /* candidates */
#define anagram_MAXSOL 51 /* words in the solution */
#define anagram_ALPHABET 26 /* letters in the alphabet */
#define anagram_OneStep(i) \
if ((aqNext[i] = pqMask[i] - pw->aqMask[i]) & anagram_aqMainSign[i]) { \
ppwStart++; \
continue; \
}
/*
Type definitions
*/
typedef unsigned int anagram_Quad; /* for building our bit mask */
/* A Word remembers the information about a candidate word. */
typedef struct {
char *pchWord; /* the word itself */
anagram_Quad aqMask[anagram_MAX_QUADS]; /* the word's mask */
unsigned cchLength; /* letters in the word */
char padding[4];
} anagram_Word;
typedef anagram_Word *anagram_PWord;
typedef anagram_Word **anagram_PPWord;
/* A Letter remembers information about each letter in the phrase to
be anagrammed. */
typedef struct {
unsigned uFrequency; /* how many times it appears */
unsigned uShift; /* how to mask */
unsigned uBits; /* the bit mask itself */
unsigned iq; /* which Quad to inspect? */
} anagram_Letter;
typedef anagram_Letter *anagram_PLetter;
/*
Forward declaration of functions
*/
__attribute__((always_inline)) static inline void anagram_init(void);
__attribute__((noinline)) __attribute__((export_name("entrypoint")))
__attribute__((noinline)) __attribute__((export_name("entrypoint"))) void
anagram_main(void);
__attribute__((always_inline)) static inline int anagram_return(void);
__attribute__((always_inline)) static inline int anagram_ch2i(int ch);
__attribute__((always_inline)) static inline void anagram_AddWords(void);
__attribute__((always_inline)) static inline void
anagram_BuildMask(char const *pchPhrase);
__attribute__((always_inline)) static inline void
anagram_BuildWord(char *pchWord);
__attribute__((always_inline)) static inline void anagram_DumpWords(void);
__attribute__((always_inline)) static inline void
anagram_FindAnagram(anagram_Quad *pqMask, anagram_PPWord ppwStart, int iLetter);
__attribute__((always_inline)) static inline anagram_PWord
anagram_NewWord(void);
__attribute__((always_inline)) static inline anagram_PWord
anagram_NextWord(void);
__attribute__((always_inline)) static inline void anagram_ReadDict(void);
__attribute__((always_inline)) static inline void anagram_Reset(void);
__attribute__((always_inline)) static inline void anagram_SortCandidates(void);
/*
Declaration of global variables
*/
extern char const *anagram_achPhrase[3];
extern char const *anagram_dictionary[anagram_DICTWORDS];
/* candidates we've found so far */
static anagram_PWord anagram_apwCand[anagram_MAXCAND];
/* how many of them? */
static unsigned anagram_cpwCand;
/* statistics on the current phrase */
static anagram_Letter anagram_alPhrase[anagram_ALPHABET];
/* number of letters in phrase */
static int anagram_cchPhraseLength;
/* the bit field for the full phrase */
static anagram_Quad anagram_aqMainMask[anagram_MAX_QUADS];
/* where the sign bits are */
static anagram_Quad anagram_aqMainSign[anagram_MAX_QUADS];
static const int anagram_cchMinLength = 3;
/* auGlobalFrequency counts the number of times each letter appears,
summed over all candidate words. This is used to decide which letter
to attack first. */
static unsigned anagram_auGlobalFrequency[anagram_ALPHABET];
static int anagram_achByFrequency[anagram_ALPHABET]; /* for sorting */
/* the dictionary is read here */
static char *anagram_pchDictionary;
/* the answers */
static anagram_PWord anagram_apwSol[anagram_MAXSOL];
static int anagram_cpwLast;
/* buffer to write an answer */
static char anagram_buffer[30];
/*
Initialization- and return-value-related functions
*/
/* ReadDict -- read the dictionary file into memory and preprocess it
A word of length cch in the dictionary is encoded as follows:
byte 0 = cch + 3
byte 1 = number of letters in the word
byte 2... = the word itself, null-terminated
Observe that cch+3 is the length of the total encoding. These
byte streams are concatenated, and terminated with a 0.
*/
__attribute__((always_inline)) static inline void
anagram_ReadDict(void) {
char *pch;
char *pchBase;
unsigned len;
unsigned cWords = 0;
unsigned cLetters;
int i;
volatile char bitmask = 0;
len = 0;
__pragma_loopbound(2279, 2279);
for (i = 0; i < anagram_DICTWORDS; i++) {
unsigned strlen = 0;
__pragma_loopbound(1, 5);
while (anagram_dictionary[i][strlen] != 0)
strlen++;
len += strlen + 3;
}
len++;
pchBase = anagram_pchDictionary = (char *) anagram_malloc(len);
__pragma_loopbound(2279, 2279);
for (i = 0; i < anagram_DICTWORDS; i++) {
int index = 0;
pch = pchBase + 2; /* reserve for length */
cLetters = 0;
__pragma_loopbound(1, 5);
while (anagram_dictionary[i][index] != '\0') {
if (anagram_isalpha(anagram_dictionary[i][index]))
cLetters++;
*pch++ = anagram_dictionary[i][index];
index++;
*(pch - 1) ^= bitmask;
}
*pch++ = '\0';
*pchBase = (char) (pch - pchBase);
pchBase[1] = (char) cLetters;
pchBase = pch;
cWords++;
}
*pchBase++ = 0;
}
__attribute__((always_inline)) static inline void
anagram_init(void) {
anagram_ReadDict();
}
__attribute__((always_inline)) static inline int
anagram_return(void) {
int i;
char const *answer = "duke yip arm";
_Pragma(
"loopbound min 12 max 12") for (i = 0; i < 12;
i++) if (answer[i] !=
anagram_buffer[i]) return 1;
return 0;
}
/*
Core benchmark functions
*/
/* convert letter to index */
__attribute__((always_inline)) static inline int
anagram_ch2i(int ch) {
return ch - 'a';
}
__attribute__((always_inline)) static inline int
anagram_CompareFrequency(char *pch1, char *pch2) {
return anagram_auGlobalFrequency[(*(int *) pch1)] <
anagram_auGlobalFrequency[(*(int *) pch2)]
? -1
: anagram_auGlobalFrequency[(*(int *) pch1)] ==
anagram_auGlobalFrequency[(*(int *) pch2)]
? 0
: 1;
}
__attribute__((always_inline)) static inline void
anagram_Reset(void) {
anagram_bzero((char *) anagram_alPhrase,
sizeof(anagram_Letter) * anagram_ALPHABET);
anagram_bzero((char *) anagram_aqMainMask,
sizeof(anagram_Quad) * anagram_MAX_QUADS);
anagram_bzero((char *) anagram_aqMainSign,
sizeof(anagram_Quad) * anagram_MAX_QUADS);
anagram_bzero((char *) anagram_auGlobalFrequency,
sizeof(unsigned) * anagram_ALPHABET);
anagram_bzero((char *) anagram_achByFrequency,
sizeof(int) * anagram_ALPHABET);
anagram_bzero((char *) anagram_apwCand,
sizeof(anagram_PWord) * anagram_MAXCAND);
anagram_cchPhraseLength = 0;
anagram_cpwCand = 0;
}
__attribute__((always_inline)) static inline void
anagram_BuildMask(char const *pchPhrase) {
int i;
int ch;
unsigned iq; /* which Quad? */
unsigned int cbtUsed; /* bits used in the current Quad */
unsigned int cbtNeed; /* bits needed for current letter */
anagram_Quad qNeed; /* used to build the mask */
/* Tabulate letter frequencies in the phrase */
anagram_cchPhraseLength = 0;
__pragma_loopbound(11, 11);
while ((ch = *pchPhrase++) != '\0') {
if (anagram_isalpha(ch)) {
ch = anagram_tolower(ch);
anagram_alPhrase[anagram_ch2i(ch)].uFrequency++;
anagram_cchPhraseLength++;
}
}
/* Build masks */
iq = 0; /* which quad being used */
cbtUsed = 0; /* bits used so far */
__pragma_loopbound(26, 26);
for (i = 0; i < anagram_ALPHABET; i++) {
if (anagram_alPhrase[i].uFrequency == 0) {
anagram_auGlobalFrequency[i] = ~0u; /* to make it sort last */
} else {
anagram_auGlobalFrequency[i] = 0u;
_Pragma(
"loopbound min 1 max 2") for (cbtNeed = 1, qNeed = 1;
anagram_alPhrase[i].uFrequency >=
qNeed;
cbtNeed++, qNeed <<= 1);
if (cbtUsed + cbtNeed > anagram_MASK_BITS)
cbtUsed = 0;
anagram_alPhrase[i].uBits = qNeed - 1;
if (cbtUsed)
qNeed <<= cbtUsed;
anagram_aqMainSign[iq] |= qNeed;
anagram_aqMainMask[iq] |=
(anagram_Quad) anagram_alPhrase[i].uFrequency << cbtUsed;
anagram_alPhrase[i].uShift = cbtUsed;
anagram_alPhrase[i].iq = iq;
cbtUsed += cbtNeed;
}
}
}
__attribute__((always_inline)) static inline anagram_PWord
anagram_NewWord(void) {
anagram_PWord pw;
pw = (anagram_Word *) anagram_malloc(sizeof(anagram_Word));
return pw;
}
/* NextWord -- get another candidate entry, creating if necessary */
__attribute__((always_inline)) static inline anagram_PWord
anagram_NextWord(void) {
anagram_PWord pw;
pw = anagram_apwCand[anagram_cpwCand++];
if (pw != 0)
return pw;
anagram_apwCand[anagram_cpwCand - 1] = anagram_NewWord();
return anagram_apwCand[anagram_cpwCand - 1];
}
/* BuildWord -- build a Word structure from an ASCII word
If the word does not fit, then do nothing. */
__attribute__((always_inline)) static inline void
anagram_BuildWord(char *pchWord) {
unsigned char cchFrequency[anagram_ALPHABET];
int i;
char *pch = pchWord;
anagram_PWord pw;
unsigned int cchLength = 0;
anagram_bzero((char *) cchFrequency,
sizeof(unsigned char) * anagram_ALPHABET);
/* Build frequency table */
__pragma_loopbound(3, 5);
while ((i = *pch++) != '\0') {
if (!anagram_isalpha(i))
continue;
i = anagram_ch2i(anagram_tolower(i));
if (++cchFrequency[i] > anagram_alPhrase[i].uFrequency)
return;
++cchLength;
}
/* Update global count */
__pragma_loopbound(26, 26);
for (i = 0; i < anagram_ALPHABET; i++)
anagram_auGlobalFrequency[i] += cchFrequency[i];
/* Create a Word structure and fill it in, including building the
bitfield of frequencies. */
pw = anagram_NextWord();
anagram_bzero((char *) (pw->aqMask),
sizeof(anagram_Quad) * anagram_MAX_QUADS);
pw->pchWord = pchWord;
pw->cchLength = cchLength;
__pragma_loopbound(26, 26);
for (i = 0; i < anagram_ALPHABET; i++) {
pw->aqMask[anagram_alPhrase[i].iq] |= (anagram_Quad) cchFrequency[i]
<< anagram_alPhrase[i].uShift;
}
}
/* AddWords -- build the list of candidates */
__attribute__((always_inline)) static inline void
anagram_AddWords(void) {
char *pch = anagram_pchDictionary; /* walk through the dictionary */
anagram_cpwCand = 0;
__pragma_loopbound(1967, 1967);
while (*pch) {
if ((pch[1] >= anagram_cchMinLength &&
pch[1] + anagram_cchMinLength <= anagram_cchPhraseLength) ||
pch[1] == anagram_cchPhraseLength)
anagram_BuildWord(pch + 2);
pch += *pch;
}
}
__attribute__((always_inline)) static inline void
anagram_DumpWords(void) {
int i, j;
int offset = 0;
__pragma_loopbound(3, 3);
for (i = 0; i < anagram_cpwLast; i++) {
__pragma_loopbound(3, 5);
for (j = 0; anagram_apwSol[i]->pchWord[j] != '\0'; j++)
anagram_buffer[offset + j] = anagram_apwSol[i]->pchWord[j];
offset += j;
anagram_buffer[offset++] = ' ';
}
anagram_buffer[offset++] = '\0';
}
__attribute__((always_inline)) static inline void
anagram_FindAnagram(anagram_Quad *pqMask, anagram_PPWord ppwStart,
int iLetter) {
anagram_Quad aqNext[anagram_MAX_QUADS];
register anagram_PWord pw;
anagram_Quad qMask;
unsigned iq;
anagram_PPWord ppwEnd = &anagram_apwCand[0];
ppwEnd += anagram_cpwCand;
__pragma_loopbound(0, 6);
while (1) {
iq = anagram_alPhrase[anagram_achByFrequency[iLetter]].iq;
qMask = anagram_alPhrase[anagram_achByFrequency[iLetter]].uBits
<< anagram_alPhrase[anagram_achByFrequency[iLetter]].uShift;
if (pqMask[iq] & qMask)
break;
iLetter++;
}
__pragma_loopbound(0, 11);
while (ppwStart < ppwEnd) {
pw = *ppwStart;
#if anagram_MAX_QUADS > 0
anagram_OneStep(0);
#endif
#if anagram_MAX_QUADS > 1
anagram_OneStep(1);
#endif
#if anagram_MAX_QUADS > 2
anagram_OneStep(2);
#endif
#if anagram_MAX_QUADS > 3
anagram_OneStep(3);
#endif
#if anagram_MAX_QUADS > 4
@@"Add more unrolling steps here, please." @ @
#endif
/* If the pivot letter isn't present, defer this word until later */
if ((pw->aqMask[iq] & qMask) == 0) {
*ppwStart = *(--ppwEnd);
*ppwEnd = pw;
continue;
}
/* If we get here, this means the word fits. */
anagram_apwSol[anagram_cpwLast++] = pw;
if (anagram_cchPhraseLength -= pw->cchLength) { /* recurse */
/* The recursive call scrambles the tail, so we have to be
pessimistic. */
ppwEnd = &anagram_apwCand[0];
ppwEnd += anagram_cpwCand;
anagram_FindAnagram(&aqNext[0], ppwStart, iLetter);
} else /* found one */
anagram_DumpWords();
anagram_cchPhraseLength += pw->cchLength;
--anagram_cpwLast;
ppwStart++;
continue;
}
}
__attribute__((always_inline)) static inline void
anagram_SortCandidates(void) {
int i;
/* Sort the letters by frequency */
__pragma_loopbound(26, 26);
for (i = 0; i < anagram_ALPHABET; i++)
anagram_achByFrequency[i] = i;
anagram_qsort(anagram_achByFrequency, anagram_ALPHABET, sizeof(int));
}
__attribute__((noinline)) __attribute__((export_name("entrypoint")))
__attribute__((noinline)) __attribute__((export_name("entrypoint"))) void
anagram_main(void) {
int i;
__pragma_loopbound(3, 3);
for (i = 0; i < 3; i++) {
anagram_Reset();
anagram_BuildMask(anagram_achPhrase[i]);
anagram_AddWords();
if (anagram_cpwCand == 0 || anagram_cchPhraseLength == 0)
continue;
anagram_cpwLast = 0;
anagram_SortCandidates();
_Pragma("marker call_find")
anagram_FindAnagram(anagram_aqMainMask, anagram_apwCand, 0);
_Pragma("flowrestriction 1*anagram_FindAnagram <= 51*call_find");
}
}
/*
Main function
*/
__attribute__((noinline)) __attribute__((export_name("main")))
__attribute__((noinline)) __attribute__((export_name("main"))) int
main(void) {
anagram_init();
anagram_main();
return anagram_return();
}