| Bytes | Lang | Time | Link |
|---|---|---|---|
| 065 | AWK | 250916T160647Z | xrs |
| 176 | Go | 230328T180037Z | bigyihsu |
| 101 | q | 230328T211031Z | eohara-i |
| 008 | Stax | 230328T145211Z | emirps |
| 249 | C gcc | 230303T045404Z | ErikF |
| 103 | Lua | 230305T125347Z | GulgDev |
| 034 | Python 3 | 230305T124340Z | GulgDev |
| nan | C GCC | 230303T131920Z | Peter |
| 050 | jq | 230228T195820Z | GammaFun |
| 031 | Mathematica Wolfram Language | 230303T203503Z | dirvine |
| 046 | Factor | 230228T183440Z | chunes |
| 007 | Vyxal | 230303T183050Z | Robert |
| 087 | PHP 8.x | 230302T221227Z | Ismael M |
| 014 | Japt | 230228T174935Z | Shaggy |
| 024 | Perl p | 230228T215752Z | naffetS |
| 021 | Raku | 230301T220245Z | Sean |
| 083 | Java JDK | 230228T211117Z | Unmitiga |
| 032 | Ruby n | 230301T112133Z | Kirill L |
| 031 | Charcoal | 230301T100130Z | Neil |
| 015 | Retina 0.8.2 | 230228T174039Z | Neil |
| 012 | 05AB1E | 230301T084317Z | Kevin Cr |
| 027 | APL Dyalog Extended | 230301T044018Z | Programm |
| 120 | Excel | 230228T214502Z | Jos Wool |
| 054 | Python | 230228T215429Z | naffetS |
| 007 | Vyxal | 230228T212749Z | AndrovT |
| 023 | J | 230228T204509Z | south |
| 035 | Arturo | 230228T201657Z | chunes |
| nan | Thunno | 230228T200416Z | The Thon |
| 049 | PowerShell Core | 230228T193939Z | Julian |
| 036 | Zsh | 230228T191829Z | GammaFun |
| 046 | JavaScript | 230228T175920Z | Shaggy |
| 028 | Bash + GNU utils | 230228T174407Z | Digital |
AWK, 65 bytes
BEGIN{FPAT="_*[A-z]+_*"}{for(;i++<NF;)!b[tolower($i)]++&&x++}$0=x
Go, 181 176 bytes
import(."regexp";."strings")
func f(s string)int{m:=make(map[string]int)
for _,w:=range MustCompile("\\w+").FindAllString(ToLower(s),-1){if _,o:=m[w];!o{m[w]=1}}
return len(m)}
Gets all words matching regex \w+, and adds them to a map (acting as a set in this case). Then it returns the number of items in the map (set).
- -5 bytes by @The Thonnu
q, 101 bytes
{(#:)(&:)min each not(^:)(?:){$[not(#:)(&:)x in .Q.an;0N;x]}each cut[(&:)0<>(-':)x in .Q.an;lower x]}
More verbose version:
{count where min each not null distinct {$[not count where x in .Q.an;" ";x]}each cut[where 0<>deltas x in .Q.an;lower x]}
Stax, 8 bytes
│ÿîIΔ»╝H
This is a packed stax program. When unpacked, it is the following:
v"\w+"|Fu%
Explanation
v # lowercase the input string
|F # get all regex pattern matches of regex
"\w+" # \w+
u # uniquify
% # length
C (gcc), 275 254 250 249 bytes
- -26 bytes thanks to ceilingcat
To split the words, each uppercased word is stored in a list recursively. Duplicates are nulled out, preventing them from being scanned.
g(s,t,i)char*s,**t;{char*a[2]={0,t},**v,*u;for(;*s&&!isalnum(*s)&&*s-95;s++);u=*a=strdup(s);if(i=*u){for(;*u=*s&&isalnum(*s)|*s==95;*u++=~32&*s++);i=g(s,a);}else for(v=t;v;v=v[1])if(*v)for(i++,t=v;t=t[1];)*t&&!strcmp(*v,*t)?*t=0:0;s=i;}f(s){g(s,0);}
Ungolfed (with a structure instead of an array):
struct list { char *data; struct list *prev; };
int g(char *s, struct list *t) {
int i;
char *u;
struct list a={0,t}, *v;
for(;*s&&!(isalnum(*s)&&*s-'_');s++);
u=a.data=strdup(s); // skip spaces and duplicate the string locally
if(i=*u){ // collect the word and uppercase it
for(;*u=*s&&isalnum(*s)|*s=='_';*u++=~32&*s++);
i=g(s,&a); // recursively generate list
}else // end of string: process the words
for(v=t;v;v=v->prev) // from the end, work backwards
if(v->data) // if not a duplicate
for(i++,t=v;t=t->prev;) // scan for duplicates
t->data&&!strcmp(v->data,t->data)?t->data=0:0; // null out duplicates
s=i; // return the count
}
int f(char *s) { g(s,0); } // initialize the end of list and (implicitly) return the count
Lua, 103 bytes
function x(i)r=0m={}for c in i:lower():gmatch("%w+")do if not m[c]then m[c]=1r=r+1 end end return r end
C (GCC), 156 + 48 + 22 = 226 bytes
-3 bytes thanks to ceilingcat
Use compiler flags -DW(a)=for(--s;a!isalnum(*++s)|*s==95;)*s|=32; and -DF(a)=for(a=0;a<i;++a).
char*v[99],*e;i,j,k;f(char*s){e=s+strlen(s);W()for(i=0;s<e;){v[i++]=s;W(!)W((*s=0)|)}F(j)F(k)strcmp(v[j],v[k])|j==k||(*v[j]=0);k=0;F(j)*v[j]&&++k;return k;}
Explanation:
char*v[99],*e;
i,j,k;
f(char*s)
{
// Set e to the end of s
e=s+strlen(s);
// Set s to the first character that is alphanumerical or an underscore.
W()
// While s hasn't moved past the end of the string ...
for(i=0;s<e;) {
// Store s in v[i], then increment i
v[i++]=s;
// Set s to the first character that isn't alphanumerical or an
// underscore.
W(!)
// Set all characters between the current word and the next to 0, and
// set s to the first character in the next word.
W((*s=0)|)
}
// Iterate through the words with j.
F(j)
// Iterate through the words with k.
F(k)
// If words at index j and k are equal but j and k aren't the same
// index, set the first character in word j to 0, marking the word
// as a duplicate.
strcmp(v[j],v[k])|j==k||(*v[j]=0);
k=0;
F(j)
// For each word that hasn't been marked as a duplicate, increment k.
*v[j]&&++k;
// Return the number of words not marked as a duplicate.
return k;
}
jq, 53 52 50 bytes
-1 byte by Neil for recognizing splitting on just \W works since we handle empty strings anyway. -2 bytes by me for realizing I could ascii_downcase first, saving a . and a |
ascii_downcase+":"|[splits("\\W")]|unique|length-1
Try it online!
Try it online!
Try it online!
Equivalent 50 byte answer:
ascii_downcase+":"|split("\\W";"")|unique|length-1
Thanks to chunes's answer for inspiring me to try golfing splits("\\W+") instead of match("\\w+";"g"). Turns out, despite having to work around the empty string being matched in some cases, it is two bytes shorter!
For those curious, here's the match method:
[match("\\w+";"g").string|ascii_downcase]|unique|length
Mathematica (Wolfram Language), 31 bytes
Length@*WordCounts@*ToLowerCase
The Mathematica built-in WordCounts treats a hyphenated word as a single word. So this program is not going to work correctly on the fourth example, "hello-world2". I discuss this in the comment below.
Factor, 47 46 bytes
[ >lower R/ \W/ re-split harvest cardinality ]
-1 byte thanks to GammaFunction
- `>lower`: convert input to lowercase
- `R/ \W/ re-split`: split on non-word characters
- `harvest`: remove empty strings
- `cardinality`: length without duplicates
Splitting and harvesting is shorter than simply getting a list of matches because the word for that (all-matching-slices) is super long. Not sure if there is a way to prevent the empty strings in pure regex, might be shorter.
Vyxal, 7 bytes
ɽøWǍUL‹
Explanation:
ɽ - Lowercases Input
øW - Groups string by words into a list
Ǎ - Removes all non-alphabetical items, leaving empty list spaces
U - Removes all non-unique list items
L - Gets length of list
‹ - Decrements by 1, to account for extra list item for first symbol
PHP 8.x, 87 bytes
This is a quite long piece of code...
fn($z)=>count(array_flip(array_filter(array_map('strtolower',preg_split('/\W+/',$z)))))
It's almost self-explanatory.
It's so void of PHP-only tricks that it is so trivial to re-implement it into JavaScript!
// PHP-like code: mirrors the PHP answer, using the helper functions defined below.
// Reading from the inside out: split the input on runs of non-word characters,
// lowercase every piece, drop the empty strings, keep only the unique values,
// and count what is left.
let fn = ($z)=>count(array_flip(array_filter(array_map('strtolower',preg_split('\\W+',$z)))));
// Test boilerplate that has enough functionality
// Stand-in for PHP's count(): reports how many elements `value` holds.
function count(value) {
    const { length } = value;
    return length;
}
// Stand-in for PHP's strtolower(): returns `str` converted to lowercase.
function strtolower(str) {
    const lowered = str.toLowerCase();
    return lowered;
}
// Stand-in for PHP's preg_split(): `regex` is the pattern source as a string
// (no surrounding slashes); `subject` is split wherever the pattern matches.
function preg_split(regex, subject) {
    const pattern = new RegExp(regex);
    return subject.split(pattern);
}
// Stand-in for PHP's array_filter() as used here: returns a new array
// without the empty-string entries.
function array_filter(array) {
    return array.filter((value) => value !== '');
}
// Stand-in for PHP's array_flip(): only the de-duplicating effect is
// reproduced, returning the unique values in first-seen order.
function array_flip(array) {
    const unique = new Set(array);
    return [...unique];
}
// Stand-in for PHP's array_map(): `fn` is either a function or the name of a
// global function, which is looked up on `window`.
function array_map(fn, array) {
    let callback = fn;
    if (typeof fn === 'string') {
        callback = window[fn];
    }
    return array.map(callback);
}
// Event handler - does the basic output thing:
// on every input event, runs fn on the field's current value and writes the
// result into `output`.
// NOTE: `text` and `output` are not declared in this script; presumably they
// resolve to the elements with those ids in the accompanying HTML.
text.oninput = function(){
output.innerText = fn(this.value);
};
<input type="text" id="text"/>
<p>Output: <span id="output">--</span></p>
Differences
There aren't a lot of differences between the PHP and JavaScript versions:
- The `fn($z)=>[...]` has to be written without the `fn` bit.
- The regular expression has to be escaped and can't have the slashes.
  This means it changes from `/\W+/` to `\\W+`.
- The function `array_flip` only returns a set of all unique values.
  The PHP function returns the array with the keys set from the values.
  That is, an array like `['a', 'fox', 'a', 'car']` will be returned as `['a' => 2, 'fox' => 1, 'car' => 3]` while JavaScript returns `['a', 'fox', 'car']`.
The end result is the same: an array that has the same number of unique elements.
These differences won't affect the accuracy of the results.
But it's worth dealing with them to give you an improved testing environment.
Japt, 14 bytes
Yet another one of those occasions I regret suggesting the removal of _ from the \w RegEx class in Japt!
f"[%w_]+" üv l
f"[%w_]+" üv l :Implicit input of string
f :Match
"[%w_]+" : RegEx /[a-z0-9_]+/gi
ü :Group & sort by
v : Lowercase
l :Length
Raku, 21 bytes
+*.lc.comb(/\w+/).Set
This is an anonymous function. The argument (*) is converted to lowercase (.lc), then the substrings matching one or more word characters are extracted (.comb(/\w+/)) and converted to a set (.Set), which discards the duplicates. Finally, that set is expressed as a number (+), yielding its size.
Java (JDK), 83 bytes
s->java.util.Arrays.stream((" "+s).toLowerCase().split("\\W")).distinct().count()-1
Saved 17 bytes thanks to Neil.
Charcoal, 31 bytes
≔⦃⦄θFΦ⪪↧S⁻⪪γ¹⊞OE³⁶⍘ιφ_ι§≔θιιILθ
Attempt This Online! Link is to verbose version of code. Explanation: Inspired by @KevinCruijssen's 05AB1E answer, so assumes the input only contains printable ASCII.
≔⦃⦄θ
Start with an empty dictionary.
FΦ⪪↧S⁻⪪γ¹⊞OE³⁶⍘ιφ_ι
Split the lowercased input on all printable ASCII except the characters used to encode base 36 and _, and...
§≔θιι
... set each word as a key in the dictionary.
ILθ
Output the final length of the dictionary.
Retina 0.8.2, 15 bytes
T`L`l
D`\w+
\w+
Try it online! Link includes test cases. Explanation:
T`L`l
Convert to lower case.
D`\w+
Deduplicate words.
\w+
Count the number of remaining words.
13 bytes in Retina 1:
D$`\w+
$l
\w+
Try it online! Link includes test cases. Explanation:
D$`\w+
$l
Deduplicate words by lowercased value.
\w+
Count the number of remaining words.
05AB1E, 12 bytes
žQžjмS¡õKlÙg
Assumes the input will only contain printable ASCII characters.
Try it online or verify all test cases.
Explanation:
žQ # Push the constant string with all printable ASCII characters
žj # Push the constant string with "a-zA-Z0-9_"
м # Remove all those characters
S # Convert the string to a list of characters
¡ # Split the (implicit) input-string by each of those characters
õK # Remove any empty strings from the list
l # Convert each word to lowercase
Ù # Uniquify the list of lowercase words
g # Pop and push the length
# (which is output implicitly as result)
APL (Dyalog Extended), 27 bytes
f←{⍴∪(⎕A,⎕D,'_')(∊⍨⊆⊢)1⎕C⍵}
f←{⍴∪(⎕A,⎕D,'_')(∊⍨⊆⊢)1⎕C⍵}
1⎕C to uppercase
(⎕A,⎕D,'_') [A-Z0-9_]
(⎕A,⎕D,'_')(∊⍨⊆⊢)1⎕C Partition (⊆) using Membership (∊)
∪ remove duplicates
⍴ count words
Excel, 120 bytes
=LET(
x,MID(UPPER(A1),ROW(A:A),1),
ROWS(
UNIQUE(
TEXTSPLIT(A1,,
IF((ABS(77.5-CODE(x&"Z"))<13)+1-ISERR(0+x)+(x="_")=0,x),1
)
)
)
)
Vyxal, 7 bytes
`†`ẎɽUL
`†`Ẏ # find all matches of \w+
ɽ # to lowercase
U # uniquify
L # length
With a flag:
Vyxal l, 6 bytes
`†`ẎɽU
Thunno, \$ 11 \log_{256}(96) \approx \$ 9.05 bytes
u"\w+"AfZUL
Same approach as basically every other answer.
u"\w+"AfZUL # Implicit input
u # Uppercase
"\w+"Af # Regex findall "\w+"
ZU # Uniquify
L # Length
PowerShell Core, 49 bytes
($args-split'\W'-ne''|%{$_|% *l*r}|sort|gu).Count
($args-split'\W'-ne''|%{$_|% *l*r}|sort|gu).Count # full function
$args-split'\W'-ne'' # Splits the input string on non-word characters and removes empty entries
|%{$_|% *l*r} # Calls ToLower() on each of the words
|sort|gu # Get unique words, Get-Unique needs the list to be sorted to remove all duplicates
( ).Count # Return the count
Zsh, 36 bytes
<<<${#${(u)=${1:l}//[^[:IDENT:]]/ }}
Fortunately, the [:IDENT:] character class is exactly the words we should keep.
<<<${#${(u)=${1:l}//[^[:IDENT:]]/ }}
${1:l} # lowercase string
${ //[^[:IDENT:]]/ } # // replace non-[:IDENT:] with spaces
${ = } # = split on $IFS (space/tab/newline)
${(u) } # keep first occurrence of each word
${# } # count
<<< # print