/*************************************************************************/ /* */ /* Copyright (c) 1997-98 Richard Tobin, Language Technology Group, HCRC, */ /* University of Edinburgh. */ /* */ /* THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND, */ /* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ /* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ /* IN NO EVENT SHALL THE AUTHOR OR THE UNIVERSITY OF EDINBURGH BE LIABLE */ /* FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF */ /* CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION */ /* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* */ /*************************************************************************/ #ifdef FOR_LT #include "lt-defs.h" #include "lt-memory.h" #include "lt-errmsg.h" #include "lt-comment.h" #include "lt-safe.h" #include "nsl-err.h" #define Strerror() strErr() #define Malloc salloc #define Realloc srealloc #define Free sfree #define fopen stdsfopen #else #include "system.h" #define LT_ERROR(err, format) fprintf(stderr, format) #define LT_ERROR1(err, format, arg) fprintf(stderr, format, arg) #define LT_ERROR2(err, format, arg1, arg2) fprintf(stderr, format, arg1, arg2) #define LT_ERROR3(err, format, arg1, arg2, arg3) fprintf(stderr, format, arg1, arg2, arg3) #define WARN(err, format) fprintf(stderr, format) #define WARN1(err, format, arg) fprintf(stderr, format, arg) #define Strerror() strerror(errno) #ifdef MAXPATHLEN #define CWDBS MAXPATHLEN+1 #else #define CWDBS 1025 #endif #define GETWD(buf) getcwd(buf,CWDBS) #endif /* FOR_LT */ #include #include #include #include #include #include /* that's where strerror is. really. */ #include #ifdef WIN32 #include #endif #ifdef SOCKETS_IMPLEMENTED #ifdef WIN32 #undef boolean #include #include #else #include #include #include #include #endif #endif #include "string16.h" #include "stdio16.h" #include "url.h" #ifdef HAVE_LIBZ #include "zlib.h" #ifdef macintosh #include #include #endif #endif static FILE16 *http_open(const char *url, const char *host, int port, const char *path, const char *type); static FILE16 *file_open(const char *url, const char *host, int port, const char *path, const char *type); static void parse_url(const char *url, char **scheme, char **host, int *port, char **path); /* Mapping of scheme names to opening functions */ struct { char *scheme; FILE16 *(*open)(const char *, const char *, int, const char *, const char *); } schemes[] = { {(char *)"http", http_open}, {(char *)"file", file_open}, }; #define NSCHEME (sizeof(schemes) / sizeof(schemes[0])) /* Construct a default base URL, essentially file:`pwd`/ */ char *default_base_url(void) { char buf[CWDBS]; char *url; if(!GETWD(buf)) { WARN(LEFILE, "Warning: can't get current directory for default base url\n"); return strdup8("file:/"); } #ifdef WIN32 /* DOS: translate C:\a\b to file:/C:/a/b/ */ /* XXX should we escape anything? */ { char *p; for(p=buf; *p; p++) if(*p == '\\') *p = '/'; } url = Malloc(6 + strlen(buf) + 2); sprintf(url, "file:/%s/", buf); #else #ifdef mac_filenames /* Mac: translate a:b to file:/a/b/ */ /* XXX should escape spaces and slashes, at least */ { char *p; for(p=buf; *p; p++) if(*p == ':') *p = '/'; /* Mac getcwd (always?) has a trailing separator, which we here bash */ if(*--p == '/') *p = 0; } url = Malloc(6 + strlen(buf) + 2); sprintf(url, "file:/%s/", buf); #else /* Unix: translate /a/b to file:/a/b/ */ url = Malloc(5 + strlen(buf) + 2); sprintf(url, "file:%s/", buf); #endif #endif return url; } /* * Merge a URL with a base URL if necessary. * The merged URL is returned. * The parts of the URL are returned in scheme, host, port and path * if these are non-null. * Caller should free the results. */ char *url_merge(const char *url, const char *base, char **_scheme, char **_host, int *_port, char **_path) { char *merged_scheme, *merged_host, *merged_path, *merged_url; char *scheme=0, *host=0, *path=0; char *base_scheme=0, *base_host=0, *base_path=0; char *default_base=0; int port, base_port, merged_port, i, j; char *p; /* First see if we have an absolute URL */ parse_url(url, &scheme, &host, &port, &path); if(scheme && (host || *path == '/')) { merged_scheme = scheme; merged_host = host; merged_port = port; merged_path = path; merged_url = strdup8(url); goto ok; } /* Relative URL, so we need the base URL */ if(!base) base = default_base = default_base_url(); parse_url(base, &base_scheme, &base_host, &base_port, &base_path); if(base_scheme && (base_host || *base_path == '/')) ; else { LT_ERROR1(LEFILE, "Error: bad base URL <%s>\n", base); goto bad; } /* Determine merged path */ if(path[0] == '/') { /* not relative, use as-is */ merged_path = path; path = 0; } else { /* relative, append to base path */ merged_path = Malloc(strlen(base_path) + strlen(path) + 1); strcpy(merged_path, base_path); /* strip last component of base */ for(i=strlen(merged_path)-1; i>=0 && merged_path[i] != '/'; i--) merged_path[i] = '\0'; /* append relative path */ strcat(merged_path, path); /* Remove . and .. components from path */ p = merged_path; for(i=0; p[i]; ) { assert(p[i] == '/'); /* find next segment */ for(j=i+1; p[j] && p[j] != '/'; j++) ; /* Do we have "." ? */ if(j - i == 2 && p[i+1] == '.') { strcpy(&p[i+1], p[j] ? &p[j+1] : &p[j]); continue; } /* Do we have "/.." with != ".." ? */ /* (We know we're not looking at "./" so we don't have to * worry about "./..") */ if(p[j] == '/' && p[j+1] == '.' && p[j+2] == '.' && (p[j+3] == '/' || p[j+3] == '\0') && (j - i != 3 || p[i+1] != '.' || p[i+2] != '.')) { strcpy(&p[i+1], p[j+3] ? &p[j+4] : &p[j+3]); i = 0; /* start again from beginning */ continue; } /* move to next segment */ i = j; } } /* Check for deviant relative URLs like file:foo */ if(scheme && !host && *path != '/') { if(strcmp(scheme, base_scheme) == 0) { WARN1(LEFILE, "Warning: relative URL <%s> contains scheme, contrary to RFC 1808\n", url); } else { LT_ERROR2(LEFILE, "Error: relative URL <%s> has scheme different from base <%s>\n", url, base); goto bad; } } /* Return the parts and the whole thing */ merged_scheme = base_scheme; if(scheme) Free(scheme); if(host) { merged_host = host; Free(base_host); merged_port = port; } else { merged_host = base_host; merged_port = base_port; } Free(path); Free(base_path); merged_url = Malloc(strlen(merged_scheme) + 1 + (merged_host ? 2 + strlen(merged_host) + 10 : 0) + strlen(merged_path) + 1); if(merged_host) { if(merged_port == -1) sprintf(merged_url, "%s://%s%s", merged_scheme, merged_host, merged_path); else sprintf(merged_url, "%s://%s:%d%s", merged_scheme, merged_host, merged_port, merged_path); } else sprintf(merged_url, "%s:%s", merged_scheme, merged_path); ok: Free(default_base); if(_scheme) *_scheme = merged_scheme; else Free(merged_scheme); if(_host) *_host = merged_host; else Free(merged_host); if(_port) *_port = merged_port; if(_path) *_path = merged_path; else Free(merged_path); return merged_url; bad: Free(default_base); Free(scheme); Free(host); Free(path); Free(base_scheme); Free(base_host); Free(base_path); return NULL; } /* * Open a stream to a URL. * url may be a relative URL, in which case it is merged with base, * which is typically the URL of the containing document. If base * is null, file:`pwd`/ is used, which is the right thing to do for * filenames. If base is "", there is no base URL and relative * URLs will fail. * If merged_url is non-null the resulting URL is stored in it. * If type begins "r", the URL is opened for reading, if "w" for * writing. Writing is only supported for file URLs. * If the type begins "rl", the data will be copied to a temporary * file so that seeking is possible (NOT YET IMPLEMENTED). * Returns a FILE16 for success, NULL for failure. */ FILE16 *url_open(const char *url, const char *base, const char *type, char **merged_url) { char *scheme, *host, *path, *m_url; int port, i; FILE16 *f; #ifdef HAVE_LIBZ int len, gzipped = 0; #endif /* Determine the merged URL */ if(!(m_url = url_merge(url, base, &scheme, &host, &port, &path))) return 0; #ifdef HAVE_LIBZ len = strlen(m_url); if(len > 3 && strcmp8(m_url+len-3, ".gz") == 0) gzipped = 1; #endif /* printf("<%s> <%s> <%d> <%s>\n", scheme, host ? host : "", port, path); printf("%s\n", m_url); */ /* Pass to the appropriate opening function */ for(i=0; ih_addr, hostent->h_length); addr.sin_port = htons((u_short)(port == -1 ? 80 : port)); /* Connect */ if(connect(s, (struct sockaddr *)&addr, sizeof(addr)) == -1) { LT_ERROR1(LEFILE, "Error: system call connect failed: %s\n", Strerror()); return 0; } #ifndef WIN32 #ifdef macintosh fin = fdopen(s, "rb"); setvbuf(fin, 0, _IONBF, 0); fout = fdopen(dup(s), "wb"); #else fin = fdopen(s, "r"); setvbuf(fin, 0, _IONBF, 0); fout = fdopen(dup(s), "w"); #endif #endif /* Send the request */ /* * Apparently on the Macintosh, \n might not be ASCII LF, so we'll * use numerics to be sure. */ #ifdef WIN32 sprintf(buf, "GET %s HTTP/1.0\012\015Connection: close\012\015\012\015", path); if (send(s,buf,strlen8(buf),0)==SOCKET_ERROR) { LT_ERROR1(LEFILE, "Error: system call socket failed: %d\n", WSAGetLastError()); /* XXX close the socket? */ return 0; }; #else fprintf(fout, "GET %s HTTP/1.0\012\015Connection: close\012\015\012\015", path); /* We used to test for errors after doing fclose, but this seemed to produce spurious errors under Linux (RedHat 4.2), so now we do fflush and test after that. */ fflush(fout); if(ferror(fout)) { LT_ERROR1(LEWRTF, "Error: write to socket failed: %s\n",Strerror()); fclose(fout); fclose(fin); return 0; } fclose(fout); #endif /* Read the status line */ #ifdef WIN32 for(i=0; i url && *p == ':') { *scheme = Malloc(p - url + 1); strncpy(*scheme, url, p - url); (*scheme)[p - url] = '\0'; url = p+1; } /* Does it have a net_loc? */ if(url[0] == '/' && url[1] == '/') { url += 2; for(p = (char *)url; *p; p++) if(*p == '/') break; /* Does it have a port number? */ for(q = p-1; q >= url; q--) if(!isdigit((int)*q)) break; if(q < p-1 && *q == ':') *port = atoi(q+1); else q = p; *host = Malloc(q - url + 1); strncpy(*host, url, q - url); (*host)[q - url] = '\0'; url = p; } /* The rest is the path */ if(*url) *path = strdup8(url); else *path = strdup8("/"); /* Windoze users have a tendency to use backslashes instead of slashes */ for(p=*path; *p; p++) if(*p == '\\') { if(!warned) { WARN1(LEFILE, "Warning: illegal backslashes in URL path \"%s\"" "replaced by slashes\n", url); warned = 1; } *p = '/'; } }